blkcg: add blkg_policy_data->plid
[linux-3.10.git] / block / blk-core.c
index 795154e..c973249 100644 (file)
 #include <linux/task_io_accounting_ops.h>
 #include <linux/fault-inject.h>
 #include <linux/list_sort.h>
+#include <linux/delay.h>
+#include <linux/ratelimit.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
 
 #include "blk.h"
+#include "blk-cgroup.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
 
-static int __make_request(struct request_queue *q, struct bio *bio);
+DEFINE_IDA(blk_queue_ida);
 
 /*
  * For the allocated request tables
@@ -216,12 +220,13 @@ static void blk_delay_work(struct work_struct *work)
  * Description:
  *   Sometimes queueing needs to be postponed for a little while, to allow
  *   resources to come back. This function will make sure that queueing is
- *   restarted around the specified time.
+ *   restarted around the specified time. Queue lock must be held.
  */
 void blk_delay_queue(struct request_queue *q, unsigned long msecs)
 {
-       queue_delayed_work(kblockd_workqueue, &q->delay_work,
-                               msecs_to_jiffies(msecs));
+       if (likely(!blk_queue_dead(q)))
+               queue_delayed_work(kblockd_workqueue, &q->delay_work,
+                                  msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_delay_queue);
 
@@ -259,7 +264,7 @@ EXPORT_SYMBOL(blk_start_queue);
  **/
 void blk_stop_queue(struct request_queue *q)
 {
-       __cancel_delayed_work(&q->delay_work);
+       cancel_delayed_work(&q->delay_work);
        queue_flag_set(QUEUE_FLAG_STOPPED, q);
 }
 EXPORT_SYMBOL(blk_stop_queue);
@@ -279,7 +284,7 @@ EXPORT_SYMBOL(blk_stop_queue);
  *
  *     This function does not cancel any asynchronous activity arising
  *     out of elevator or throttling code. That would require elevaotor_exit()
- *     and blk_throtl_exit() to be called with queue lock initialized.
+ *     and blkcg_exit_queue() to be called with queue lock initialized.
  *
  */
 void blk_sync_queue(struct request_queue *q)
@@ -290,6 +295,34 @@ void blk_sync_queue(struct request_queue *q)
 EXPORT_SYMBOL(blk_sync_queue);
 
 /**
+ * __blk_run_queue_uncond - run a queue whether or not it has been stopped
+ * @q: The queue to run
+ *
+ * Description:
+ *    Invoke request handling on a queue if there are any pending requests.
+ *    May be used to restart request handling after a request has completed.
+ *    This variant runs the queue whether or not the queue has been
+ *    stopped. Must be called with the queue lock held and interrupts
+ *    disabled. See also @blk_run_queue.
+ */
+inline void __blk_run_queue_uncond(struct request_queue *q)
+{
+       if (unlikely(blk_queue_dead(q)))
+               return;
+
+       /*
+        * Some request_fn implementations, e.g. scsi_request_fn(), unlock
+        * the queue lock internally. As a result multiple threads may be
+        * running such a request function concurrently. Keep track of the
+        * number of active request_fn invocations such that blk_drain_queue()
+        * can wait until all these request_fn calls have finished.
+        */
+       q->request_fn_active++;
+       q->request_fn(q);
+       q->request_fn_active--;
+}
+
+/**
  * __blk_run_queue - run a single device queue
  * @q: The queue to run
  *
@@ -302,7 +335,7 @@ void __blk_run_queue(struct request_queue *q)
        if (unlikely(blk_queue_stopped(q)))
                return;
 
-       q->request_fn(q);
+       __blk_run_queue_uncond(q);
 }
 EXPORT_SYMBOL(__blk_run_queue);
 
@@ -312,14 +345,12 @@ EXPORT_SYMBOL(__blk_run_queue);
  *
  * Description:
  *    Tells kblockd to perform the equivalent of @blk_run_queue on behalf
- *    of us.
+ *    of us. The caller must hold the queue lock.
  */
 void blk_run_queue_async(struct request_queue *q)
 {
-       if (likely(!blk_queue_stopped(q))) {
-               __cancel_delayed_work(&q->delay_work);
-               queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
-       }
+       if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
+               mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
 }
 EXPORT_SYMBOL(blk_run_queue_async);
 
@@ -347,59 +378,219 @@ void blk_put_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_put_queue);
 
-/*
- * Note: If a driver supplied the queue lock, it is disconnected
- * by this function. The actual state of the lock doesn't matter
- * here as the request_queue isn't accessible after this point
- * (QUEUE_FLAG_DEAD is set) and no other requests will be queued.
+/**
+ * __blk_drain_queue - drain requests from request_queue
+ * @q: queue to drain
+ * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
+ *
+ * Drain requests from @q.  If @drain_all is set, all requests are drained.
+ * If not, only ELVPRIV requests are drained.  The caller is responsible
+ * for ensuring that no new requests which need to be drained are queued.
  */
-void blk_cleanup_queue(struct request_queue *q)
+static void __blk_drain_queue(struct request_queue *q, bool drain_all)
+       __releases(q->queue_lock)
+       __acquires(q->queue_lock)
 {
+       int i;
+
+       lockdep_assert_held(q->queue_lock);
+
+       while (true) {
+               bool drain = false;
+
+               /*
+                * The caller might be trying to drain @q before its
+                * elevator is initialized.
+                */
+               if (q->elevator)
+                       elv_drain_elevator(q);
+
+               blkcg_drain_queue(q);
+
+               /*
+                * This function might be called on a queue which failed
+                * driver init after queue creation or is not yet fully
+                * active yet.  Some drivers (e.g. fd and loop) get unhappy
+                * in such cases.  Kick queue iff dispatch queue has
+                * something on it and @q has request_fn set.
+                */
+               if (!list_empty(&q->queue_head) && q->request_fn)
+                       __blk_run_queue(q);
+
+               drain |= q->nr_rqs_elvpriv;
+               drain |= q->request_fn_active;
+
+               /*
+                * Unfortunately, requests are queued at and tracked from
+                * multiple places and there's no single counter which can
+                * be drained.  Check all the queues and counters.
+                */
+               if (drain_all) {
+                       drain |= !list_empty(&q->queue_head);
+                       for (i = 0; i < 2; i++) {
+                               drain |= q->nr_rqs[i];
+                               drain |= q->in_flight[i];
+                               drain |= !list_empty(&q->flush_queue[i]);
+                       }
+               }
+
+               if (!drain)
+                       break;
+
+               spin_unlock_irq(q->queue_lock);
+
+               msleep(10);
+
+               spin_lock_irq(q->queue_lock);
+       }
+
        /*
-        * We know we have process context here, so we can be a little
-        * cautious and ensure that pending block actions on this device
-        * are done before moving on. Going into this function, we should
-        * not have processes doing IO to this device.
+        * With queue marked dead, any woken up waiter will fail the
+        * allocation path, so the wakeup chaining is lost and we're
+        * left with hung waiters. We need to wake up those waiters.
         */
-       blk_sync_queue(q);
+       if (q->request_fn) {
+               struct request_list *rl;
 
-       del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
+               blk_queue_for_each_rl(rl, q)
+                       for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
+                               wake_up_all(&rl->wait[i]);
+       }
+}
+
+/**
+ * blk_queue_bypass_start - enter queue bypass mode
+ * @q: queue of interest
+ *
+ * In bypass mode, only the dispatch FIFO queue of @q is used.  This
+ * function makes @q enter bypass mode and drains all requests which were
+ * throttled or issued before.  On return, it's guaranteed that no request
+ * is being throttled or has ELVPRIV set and blk_queue_bypass() %true
+ * inside queue or RCU read lock.
+ */
+void blk_queue_bypass_start(struct request_queue *q)
+{
+       bool drain;
+
+       spin_lock_irq(q->queue_lock);
+       drain = !q->bypass_depth++;
+       queue_flag_set(QUEUE_FLAG_BYPASS, q);
+       spin_unlock_irq(q->queue_lock);
+
+       if (drain) {
+               spin_lock_irq(q->queue_lock);
+               __blk_drain_queue(q, false);
+               spin_unlock_irq(q->queue_lock);
+
+               /* ensure blk_queue_bypass() is %true inside RCU read lock */
+               synchronize_rcu();
+       }
+}
+EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
+
+/**
+ * blk_queue_bypass_end - leave queue bypass mode
+ * @q: queue of interest
+ *
+ * Leave bypass mode and restore the normal queueing behavior.
+ */
+void blk_queue_bypass_end(struct request_queue *q)
+{
+       spin_lock_irq(q->queue_lock);
+       if (!--q->bypass_depth)
+               queue_flag_clear(QUEUE_FLAG_BYPASS, q);
+       WARN_ON_ONCE(q->bypass_depth < 0);
+       spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
+
+/**
+ * blk_cleanup_queue - shutdown a request queue
+ * @q: request queue to shutdown
+ *
+ * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
+ * put it.  All future requests will be failed immediately with -ENODEV.
+ */
+void blk_cleanup_queue(struct request_queue *q)
+{
+       spinlock_t *lock = q->queue_lock;
+
+       /* mark @q DYING, no new request or merges will be allowed afterwards */
        mutex_lock(&q->sysfs_lock);
-       queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
+       queue_flag_set_unlocked(QUEUE_FLAG_DYING, q);
+       spin_lock_irq(lock);
+
+       /*
+        * A dying queue is permanently in bypass mode till released.  Note
+        * that, unlike blk_queue_bypass_start(), we aren't performing
+        * synchronize_rcu() after entering bypass mode to avoid the delay
+        * as some drivers create and destroy a lot of queues while
+        * probing.  This is still safe because blk_release_queue() will be
+        * called only after the queue refcnt drops to zero and nothing,
+        * RCU or not, would be traversing the queue by then.
+        */
+       q->bypass_depth++;
+       queue_flag_set(QUEUE_FLAG_BYPASS, q);
+
+       queue_flag_set(QUEUE_FLAG_NOMERGES, q);
+       queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
+       queue_flag_set(QUEUE_FLAG_DYING, q);
+       spin_unlock_irq(lock);
        mutex_unlock(&q->sysfs_lock);
 
+       /*
+        * Drain all requests queued before DYING marking. Set DEAD flag to
+        * prevent that q->request_fn() gets invoked after draining finished.
+        */
+       spin_lock_irq(lock);
+       __blk_drain_queue(q, true);
+       queue_flag_set(QUEUE_FLAG_DEAD, q);
+       spin_unlock_irq(lock);
+
+       /* @q won't process any more request, flush async actions */
+       del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
+       blk_sync_queue(q);
+
+       spin_lock_irq(lock);
        if (q->queue_lock != &q->__queue_lock)
                q->queue_lock = &q->__queue_lock;
+       spin_unlock_irq(lock);
 
+       /* @q is and will stay empty, shutdown and put */
        blk_put_queue(q);
 }
 EXPORT_SYMBOL(blk_cleanup_queue);
 
-static int blk_init_free_list(struct request_queue *q)
+int blk_init_rl(struct request_list *rl, struct request_queue *q,
+               gfp_t gfp_mask)
 {
-       struct request_list *rl = &q->rq;
-
        if (unlikely(rl->rq_pool))
                return 0;
 
+       rl->q = q;
        rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
        rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
-       rl->elvpriv = 0;
        init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
        init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
 
        rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
-                               mempool_free_slab, request_cachep, q->node);
-
+                                         mempool_free_slab, request_cachep,
+                                         gfp_mask, q->node);
        if (!rl->rq_pool)
                return -ENOMEM;
 
        return 0;
 }
 
+void blk_exit_rl(struct request_list *rl)
+{
+       if (rl->rq_pool)
+               mempool_destroy(rl->rq_pool);
+}
+
 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
 {
-       return blk_alloc_queue_node(gfp_mask, -1);
+       return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE);
 }
 EXPORT_SYMBOL(blk_alloc_queue);
 
@@ -413,27 +604,30 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
        if (!q)
                return NULL;
 
+       q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
+       if (q->id < 0)
+               goto fail_q;
+
        q->backing_dev_info.ra_pages =
                        (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
        q->backing_dev_info.state = 0;
        q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
        q->backing_dev_info.name = "block";
+       q->node = node_id;
 
        err = bdi_init(&q->backing_dev_info);
-       if (err) {
-               kmem_cache_free(blk_requestq_cachep, q);
-               return NULL;
-       }
-
-       if (blk_throtl_init(q)) {
-               kmem_cache_free(blk_requestq_cachep, q);
-               return NULL;
-       }
+       if (err)
+               goto fail_id;
 
        setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
                    laptop_mode_timer_fn, (unsigned long) q);
        setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
+       INIT_LIST_HEAD(&q->queue_head);
        INIT_LIST_HEAD(&q->timeout_list);
+       INIT_LIST_HEAD(&q->icq_list);
+#ifdef CONFIG_BLK_CGROUP
+       INIT_LIST_HEAD(&q->blkg_list);
+#endif
        INIT_LIST_HEAD(&q->flush_queue[0]);
        INIT_LIST_HEAD(&q->flush_queue[1]);
        INIT_LIST_HEAD(&q->flush_data_in_flight);
@@ -450,7 +644,25 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
         */
        q->queue_lock = &q->__queue_lock;
 
+       /*
+        * A queue starts its life with bypass turned on to avoid
+        * unnecessary bypass on/off overhead and nasty surprises during
+        * init.  The initial bypass will be finished when the queue is
+        * registered by blk_register_queue().
+        */
+       q->bypass_depth = 1;
+       __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+
+       if (blkcg_init_queue(q))
+               goto fail_id;
+
        return q;
+
+fail_id:
+       ida_simple_remove(&blk_queue_ida, q->id);
+fail_q:
+       kmem_cache_free(blk_requestq_cachep, q);
+       return NULL;
 }
 EXPORT_SYMBOL(blk_alloc_queue_node);
 
@@ -489,7 +701,7 @@ EXPORT_SYMBOL(blk_alloc_queue_node);
 
 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
 {
-       return blk_init_queue_node(rfn, lock, -1);
+       return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
 }
 EXPORT_SYMBOL(blk_init_queue);
 
@@ -502,7 +714,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
        if (!uninit_q)
                return NULL;
 
-       q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
+       q = blk_init_allocated_queue(uninit_q, rfn, lock);
        if (!q)
                blk_cleanup_queue(uninit_q);
 
@@ -514,25 +726,16 @@ struct request_queue *
 blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
                         spinlock_t *lock)
 {
-       return blk_init_allocated_queue_node(q, rfn, lock, -1);
-}
-EXPORT_SYMBOL(blk_init_allocated_queue);
-
-struct request_queue *
-blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
-                             spinlock_t *lock, int node_id)
-{
        if (!q)
                return NULL;
 
-       q->node = node_id;
-       if (blk_init_free_list(q))
+       if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
                return NULL;
 
        q->request_fn           = rfn;
        q->prep_rq_fn           = NULL;
        q->unprep_rq_fn         = NULL;
-       q->queue_flags          = QUEUE_FLAG_DEFAULT;
+       q->queue_flags          |= QUEUE_FLAG_DEFAULT;
 
        /* Override internal queue lock with supplied lock pointer */
        if (lock)
@@ -541,61 +744,37 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
        /*
         * This also sets hw/phys segments, boundary and size
         */
-       blk_queue_make_request(q, __make_request);
+       blk_queue_make_request(q, blk_queue_bio);
 
        q->sg_reserved_size = INT_MAX;
 
-       /*
-        * all done
-        */
-       if (!elevator_init(q, NULL)) {
-               blk_queue_congestion_threshold(q);
-               return q;
-       }
-
-       return NULL;
+       /* init elevator */
+       if (elevator_init(q, NULL))
+               return NULL;
+       return q;
 }
-EXPORT_SYMBOL(blk_init_allocated_queue_node);
+EXPORT_SYMBOL(blk_init_allocated_queue);
 
-int blk_get_queue(struct request_queue *q)
+bool blk_get_queue(struct request_queue *q)
 {
-       if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
-               kobject_get(&q->kobj);
-               return 0;
+       if (likely(!blk_queue_dying(q))) {
+               __blk_get_queue(q);
+               return true;
        }
 
-       return 1;
+       return false;
 }
 EXPORT_SYMBOL(blk_get_queue);
 
-static inline void blk_free_request(struct request_queue *q, struct request *rq)
+static inline void blk_free_request(struct request_list *rl, struct request *rq)
 {
-       if (rq->cmd_flags & REQ_ELVPRIV)
-               elv_put_request(q, rq);
-       mempool_free(rq, q->rq.rq_pool);
-}
-
-static struct request *
-blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask)
-{
-       struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
-
-       if (!rq)
-               return NULL;
-
-       blk_rq_init(q, rq);
-
-       rq->cmd_flags = flags | REQ_ALLOCED;
-
-       if (priv) {
-               if (unlikely(elv_set_request(q, rq, gfp_mask))) {
-                       mempool_free(rq, q->rq.rq_pool);
-                       return NULL;
-               }
-               rq->cmd_flags |= REQ_ELVPRIV;
+       if (rq->cmd_flags & REQ_ELVPRIV) {
+               elv_put_request(rl->q, rq);
+               if (rq->elv.icq)
+                       put_io_context(rq->elv.icq->ioc);
        }
 
-       return rq;
+       mempool_free(rq, rl->rq_pool);
 }
 
 /*
@@ -632,18 +811,23 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
        ioc->last_waited = jiffies;
 }
 
-static void __freed_request(struct request_queue *q, int sync)
+static void __freed_request(struct request_list *rl, int sync)
 {
-       struct request_list *rl = &q->rq;
+       struct request_queue *q = rl->q;
 
-       if (rl->count[sync] < queue_congestion_off_threshold(q))
+       /*
+        * bdi isn't aware of blkcg yet.  As all async IOs end up root
+        * blkcg anyway, just use root blkcg state.
+        */
+       if (rl == &q->root_rl &&
+           rl->count[sync] < queue_congestion_off_threshold(q))
                blk_clear_queue_congested(q, sync);
 
        if (rl->count[sync] + 1 <= q->nr_requests) {
                if (waitqueue_active(&rl->wait[sync]))
                        wake_up(&rl->wait[sync]);
 
-               blk_clear_queue_full(q, sync);
+               blk_clear_rl_full(rl, sync);
        }
 }
 
@@ -651,18 +835,20 @@ static void __freed_request(struct request_queue *q, int sync)
  * A request has just been released.  Account for it, update the full and
  * congestion status, wake up any waiters.   Called under q->queue_lock.
  */
-static void freed_request(struct request_queue *q, int sync, int priv)
+static void freed_request(struct request_list *rl, unsigned int flags)
 {
-       struct request_list *rl = &q->rq;
+       struct request_queue *q = rl->q;
+       int sync = rw_is_sync(flags);
 
+       q->nr_rqs[sync]--;
        rl->count[sync]--;
-       if (priv)
-               rl->elvpriv--;
+       if (flags & REQ_ELVPRIV)
+               q->nr_rqs_elvpriv--;
 
-       __freed_request(q, sync);
+       __freed_request(rl, sync);
 
        if (unlikely(rl->starved[sync ^ 1]))
-               __freed_request(q, sync ^ 1);
+               __freed_request(rl, sync ^ 1);
 }
 
 /*
@@ -684,19 +870,49 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
        return true;
 }
 
-/*
- * Get a free request, queue_lock must be held.
- * Returns NULL on failure, with queue_lock held.
- * Returns !NULL on success, with queue_lock *not held*.
+/**
+ * rq_ioc - determine io_context for request allocation
+ * @bio: request being allocated is for this bio (can be %NULL)
+ *
+ * Determine io_context to use for request allocation for @bio.  May return
+ * %NULL if %current->io_context doesn't exist.
  */
-static struct request *get_request(struct request_queue *q, int rw_flags,
-                                  struct bio *bio, gfp_t gfp_mask)
+static struct io_context *rq_ioc(struct bio *bio)
+{
+#ifdef CONFIG_BLK_CGROUP
+       if (bio && bio->bi_ioc)
+               return bio->bi_ioc;
+#endif
+       return current->io_context;
+}
+
+/**
+ * __get_request - get a free request
+ * @rl: request list to allocate from
+ * @rw_flags: RW and SYNC flags
+ * @bio: bio to allocate request for (can be %NULL)
+ * @gfp_mask: allocation mask
+ *
+ * Get a free request from @q.  This function may fail under memory
+ * pressure or if @q is dead.
+ *
+ * Must be callled with @q->queue_lock held and,
+ * Returns %NULL on failure, with @q->queue_lock held.
+ * Returns !%NULL on success, with @q->queue_lock *not held*.
+ */
+static struct request *__get_request(struct request_list *rl, int rw_flags,
+                                    struct bio *bio, gfp_t gfp_mask)
 {
-       struct request *rq = NULL;
-       struct request_list *rl = &q->rq;
-       struct io_context *ioc = NULL;
+       struct request_queue *q = rl->q;
+       struct request *rq;
+       struct elevator_type *et = q->elevator->type;
+       struct io_context *ioc = rq_ioc(bio);
+       struct io_cq *icq = NULL;
        const bool is_sync = rw_is_sync(rw_flags) != 0;
-       int may_queue, priv = 0;
+       int may_queue;
+
+       if (unlikely(blk_queue_dying(q)))
+               return NULL;
 
        may_queue = elv_may_queue(q, rw_flags);
        if (may_queue == ELV_MQUEUE_NO)
@@ -704,16 +920,15 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 
        if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
                if (rl->count[is_sync]+1 >= q->nr_requests) {
-                       ioc = current_io_context(GFP_ATOMIC, q->node);
                        /*
                         * The queue will fill after this allocation, so set
                         * it as full, and mark this process as "batching".
                         * This process will be allowed to complete a batch of
                         * requests, others will be blocked.
                         */
-                       if (!blk_queue_full(q, is_sync)) {
+                       if (!blk_rl_full(rl, is_sync)) {
                                ioc_set_batching(q, ioc);
-                               blk_set_queue_full(q, is_sync);
+                               blk_set_rl_full(rl, is_sync);
                        } else {
                                if (may_queue != ELV_MQUEUE_MUST
                                                && !ioc_batching(q, ioc)) {
@@ -722,11 +937,16 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
                                         * process is not a "batcher", and not
                                         * exempted by the IO scheduler
                                         */
-                                       goto out;
+                                       return NULL;
                                }
                        }
                }
-               blk_set_queue_congested(q, is_sync);
+               /*
+                * bdi isn't aware of blkcg yet.  As all async IOs end up
+                * root blkcg anyway, just use root blkcg state.
+                */
+               if (rl == &q->root_rl)
+                       blk_set_queue_congested(q, is_sync);
        }
 
        /*
@@ -735,47 +955,60 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
         * allocated with any setting of ->nr_requests
         */
        if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
-               goto out;
+               return NULL;
 
+       q->nr_rqs[is_sync]++;
        rl->count[is_sync]++;
        rl->starved[is_sync] = 0;
 
-       if (blk_rq_should_init_elevator(bio)) {
-               priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
-               if (priv)
-                       rl->elvpriv++;
+       /*
+        * Decide whether the new request will be managed by elevator.  If
+        * so, mark @rw_flags and increment elvpriv.  Non-zero elvpriv will
+        * prevent the current elevator from being destroyed until the new
+        * request is freed.  This guarantees icq's won't be destroyed and
+        * makes creating new ones safe.
+        *
+        * Also, lookup icq while holding queue_lock.  If it doesn't exist,
+        * it will be created after releasing queue_lock.
+        */
+       if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
+               rw_flags |= REQ_ELVPRIV;
+               q->nr_rqs_elvpriv++;
+               if (et->icq_cache && ioc)
+                       icq = ioc_lookup_icq(ioc, q);
        }
 
        if (blk_queue_io_stat(q))
                rw_flags |= REQ_IO_STAT;
        spin_unlock_irq(q->queue_lock);
 
-       rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
-       if (unlikely(!rq)) {
-               /*
-                * Allocation failed presumably due to memory. Undo anything
-                * we might have messed up.
-                *
-                * Allocating task should really be put onto the front of the
-                * wait queue, but this is pretty rare.
-                */
-               spin_lock_irq(q->queue_lock);
-               freed_request(q, is_sync, priv);
+       /* allocate and init request */
+       rq = mempool_alloc(rl->rq_pool, gfp_mask);
+       if (!rq)
+               goto fail_alloc;
 
-               /*
-                * in the very unlikely event that allocation failed and no
-                * requests for this direction was pending, mark us starved
-                * so that freeing of a request in the other direction will
-                * notice us. another possible fix would be to split the
-                * rq mempool into READ and WRITE
-                */
-rq_starved:
-               if (unlikely(rl->count[is_sync] == 0))
-                       rl->starved[is_sync] = 1;
+       blk_rq_init(q, rq);
+       blk_rq_set_rl(rq, rl);
+       rq->cmd_flags = rw_flags | REQ_ALLOCED;
+
+       /* init elvpriv */
+       if (rw_flags & REQ_ELVPRIV) {
+               if (unlikely(et->icq_cache && !icq)) {
+                       if (ioc)
+                               icq = ioc_create_icq(ioc, q, gfp_mask);
+                       if (!icq)
+                               goto fail_elvpriv;
+               }
 
-               goto out;
-       }
+               rq->elv.icq = icq;
+               if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
+                       goto fail_elvpriv;
 
+               /* @rq->elv.icq holds io_context until @rq is freed */
+               if (icq)
+                       get_io_context(icq->ioc);
+       }
+out:
        /*
         * ioc may be NULL here, and ioc_batching will be false. That's
         * OK, if the queue is under the request limit then requests need
@@ -786,71 +1019,118 @@ rq_starved:
                ioc->nr_batch_requests--;
 
        trace_block_getrq(q, bio, rw_flags & 1);
-out:
        return rq;
+
+fail_elvpriv:
+       /*
+        * elvpriv init failed.  ioc, icq and elvpriv aren't mempool backed
+        * and may fail indefinitely under memory pressure and thus
+        * shouldn't stall IO.  Treat this request as !elvpriv.  This will
+        * disturb iosched and blkcg but weird is bettern than dead.
+        */
+       printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
+                          dev_name(q->backing_dev_info.dev));
+
+       rq->cmd_flags &= ~REQ_ELVPRIV;
+       rq->elv.icq = NULL;
+
+       spin_lock_irq(q->queue_lock);
+       q->nr_rqs_elvpriv--;
+       spin_unlock_irq(q->queue_lock);
+       goto out;
+
+fail_alloc:
+       /*
+        * Allocation failed presumably due to memory. Undo anything we
+        * might have messed up.
+        *
+        * Allocating task should really be put onto the front of the wait
+        * queue, but this is pretty rare.
+        */
+       spin_lock_irq(q->queue_lock);
+       freed_request(rl, rw_flags);
+
+       /*
+        * in the very unlikely event that allocation failed and no
+        * requests for this direction was pending, mark us starved so that
+        * freeing of a request in the other direction will notice
+        * us. another possible fix would be to split the rq mempool into
+        * READ and WRITE
+        */
+rq_starved:
+       if (unlikely(rl->count[is_sync] == 0))
+               rl->starved[is_sync] = 1;
+       return NULL;
 }
 
-/*
- * No available requests for this queue, wait for some requests to become
- * available.
- *
- * Called with q->queue_lock held, and returns with it unlocked.
+/**
+ * get_request - get a free request
+ * @q: request_queue to allocate request from
+ * @rw_flags: RW and SYNC flags
+ * @bio: bio to allocate request for (can be %NULL)
+ * @gfp_mask: allocation mask
+ *
+ * Get a free request from @q.  If %__GFP_WAIT is set in @gfp_mask, this
+ * function keeps retrying under memory pressure and fails iff @q is dead.
+ *
+ * Must be callled with @q->queue_lock held and,
+ * Returns %NULL on failure, with @q->queue_lock held.
+ * Returns !%NULL on success, with @q->queue_lock *not held*.
  */
-static struct request *get_request_wait(struct request_queue *q, int rw_flags,
-                                       struct bio *bio)
+static struct request *get_request(struct request_queue *q, int rw_flags,
+                                  struct bio *bio, gfp_t gfp_mask)
 {
        const bool is_sync = rw_is_sync(rw_flags) != 0;
+       DEFINE_WAIT(wait);
+       struct request_list *rl;
        struct request *rq;
 
-       rq = get_request(q, rw_flags, bio, GFP_NOIO);
-       while (!rq) {
-               DEFINE_WAIT(wait);
-               struct io_context *ioc;
-               struct request_list *rl = &q->rq;
+       rl = blk_get_rl(q, bio);        /* transferred to @rq on success */
+retry:
+       rq = __get_request(rl, rw_flags, bio, gfp_mask);
+       if (rq)
+               return rq;
 
-               prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
-                               TASK_UNINTERRUPTIBLE);
+       if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) {
+               blk_put_rl(rl);
+               return NULL;
+       }
 
-               trace_block_sleeprq(q, bio, rw_flags & 1);
+       /* wait on @rl and retry */
+       prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
+                                 TASK_UNINTERRUPTIBLE);
 
-               spin_unlock_irq(q->queue_lock);
-               io_schedule();
+       trace_block_sleeprq(q, bio, rw_flags & 1);
 
-               /*
-                * After sleeping, we become a "batching" process and
-                * will be able to allocate at least one request, and
-                * up to a big batch of them for a small period time.
-                * See ioc_batching, ioc_set_batching
-                */
-               ioc = current_io_context(GFP_NOIO, q->node);
-               ioc_set_batching(q, ioc);
+       spin_unlock_irq(q->queue_lock);
+       io_schedule();
 
-               spin_lock_irq(q->queue_lock);
-               finish_wait(&rl->wait[is_sync], &wait);
+       /*
+        * After sleeping, we become a "batching" process and will be able
+        * to allocate at least one request, and up to a big batch of them
+        * for a small period time.  See ioc_batching, ioc_set_batching
+        */
+       ioc_set_batching(q, current->io_context);
 
-               rq = get_request(q, rw_flags, bio, GFP_NOIO);
-       };
+       spin_lock_irq(q->queue_lock);
+       finish_wait(&rl->wait[is_sync], &wait);
 
-       return rq;
+       goto retry;
 }
 
 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 {
        struct request *rq;
 
-       if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
-               return NULL;
-
        BUG_ON(rw != READ && rw != WRITE);
 
+       /* create ioc upfront */
+       create_io_context(gfp_mask, q->node);
+
        spin_lock_irq(q->queue_lock);
-       if (gfp_mask & __GFP_WAIT) {
-               rq = get_request_wait(q, rw, NULL);
-       } else {
-               rq = get_request(q, rw, NULL, gfp_mask);
-               if (!rq)
-                       spin_unlock_irq(q->queue_lock);
-       }
+       rq = get_request(q, rw, NULL, gfp_mask);
+       if (!rq)
+               spin_unlock_irq(q->queue_lock);
        /* q->queue_lock is unlocked at this point */
 
        return rq;
@@ -944,54 +1224,6 @@ static void add_acct_request(struct request_queue *q, struct request *rq,
        __elv_add_request(q, rq, where);
 }
 
-/**
- * blk_insert_request - insert a special request into a request queue
- * @q:         request queue where request should be inserted
- * @rq:                request to be inserted
- * @at_head:   insert request at head or tail of queue
- * @data:      private data
- *
- * Description:
- *    Many block devices need to execute commands asynchronously, so they don't
- *    block the whole kernel from preemption during request execution.  This is
- *    accomplished normally by inserting aritficial requests tagged as
- *    REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them
- *    be scheduled for actual execution by the request queue.
- *
- *    We have the option of inserting the head or the tail of the queue.
- *    Typically we use the tail for new ioctls and so forth.  We use the head
- *    of the queue for things like a QUEUE_FULL message from a device, or a
- *    host that is unable to accept a particular command.
- */
-void blk_insert_request(struct request_queue *q, struct request *rq,
-                       int at_head, void *data)
-{
-       int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
-       unsigned long flags;
-
-       /*
-        * tell I/O scheduler that this isn't a regular read/write (ie it
-        * must not attempt merges on this) and that it acts as a soft
-        * barrier
-        */
-       rq->cmd_type = REQ_TYPE_SPECIAL;
-
-       rq->special = data;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-
-       /*
-        * If command is tagged, release the tag
-        */
-       if (blk_rq_tagged(rq))
-               blk_queue_end_tag(q, rq);
-
-       add_acct_request(q, rq, where);
-       __blk_run_queue(q);
-       spin_unlock_irqrestore(q->queue_lock, flags);
-}
-EXPORT_SYMBOL(blk_insert_request);
-
 static void part_round_stats_single(int cpu, struct hd_struct *part,
                                    unsigned long now)
 {
@@ -1052,14 +1284,15 @@ void __blk_put_request(struct request_queue *q, struct request *req)
         * it didn't come out of our reserved rq pools
         */
        if (req->cmd_flags & REQ_ALLOCED) {
-               int is_sync = rq_is_sync(req) != 0;
-               int priv = req->cmd_flags & REQ_ELVPRIV;
+               unsigned int flags = req->cmd_flags;
+               struct request_list *rl = blk_rq_rl(req);
 
                BUG_ON(!list_empty(&req->queuelist));
                BUG_ON(!hlist_unhashed(&req->hash));
 
-               blk_free_request(q, req);
-               freed_request(q, is_sync, priv);
+               blk_free_request(rl, req);
+               freed_request(rl, flags);
+               blk_put_rl(rl);
        }
 }
 EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -1126,7 +1359,6 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
        req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
 
        drive_stat_acct(req, 0);
-       elv_bio_merged(q, req, bio);
        return true;
 }
 
@@ -1157,22 +1389,34 @@ static bool bio_attempt_front_merge(struct request_queue *q,
        req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
 
        drive_stat_acct(req, 0);
-       elv_bio_merged(q, req, bio);
        return true;
 }
 
-/*
- * Attempts to merge with the plugged list in the current process. Returns
- * true if merge was successful, otherwise false.
+/**
+ * attempt_plug_merge - try to merge with %current's plugged list
+ * @q: request_queue new bio is being queued at
+ * @bio: new bio being queued
+ * @request_count: out parameter for number of traversed plugged requests
+ *
+ * Determine whether @bio being queued on @q can be merged with a request
+ * on %current's plugged list.  Returns %true if merge was successful,
+ * otherwise %false.
+ *
+ * Plugging coalesces IOs from the same issuer for the same purpose without
+ * going through @q->queue_lock.  As such it's more of an issuing mechanism
+ * than scheduling, and the request, while may have elvpriv data, is not
+ * added on the elevator at this point.  In addition, we don't have
+ * reliable access to the elevator outside queue lock.  Only check basic
+ * merging parameters without querying the elevator.
  */
-static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
-                              struct bio *bio, unsigned int *request_count)
+static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
+                              unsigned int *request_count)
 {
        struct blk_plug *plug;
        struct request *rq;
        bool ret = false;
 
-       plug = tsk->plug;
+       plug = current->plug;
        if (!plug)
                goto out;
        *request_count = 0;
@@ -1180,12 +1424,13 @@ static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
        list_for_each_entry_reverse(rq, &plug->list, queuelist) {
                int el_ret;
 
-               (*request_count)++;
+               if (rq->q == q)
+                       (*request_count)++;
 
-               if (rq->q != q)
+               if (rq->q != q || !blk_rq_merge_ok(rq, bio))
                        continue;
 
-               el_ret = elv_try_merge(rq, bio);
+               el_ret = blk_try_merge(rq, bio);
                if (el_ret == ELEVATOR_BACK_MERGE) {
                        ret = bio_attempt_back_merge(q, rq, bio);
                        if (ret)
@@ -1202,7 +1447,6 @@ out:
 
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
-       req->cpu = bio->bi_comp_cpu;
        req->cmd_type = REQ_TYPE_FS;
 
        req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK;
@@ -1215,7 +1459,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
        blk_rq_bio_prep(req->q, req, bio);
 }
 
-static int __make_request(struct request_queue *q, struct bio *bio)
+void blk_queue_bio(struct request_queue *q, struct bio *bio)
 {
        const bool sync = !!(bio->bi_rw & REQ_SYNC);
        struct blk_plug *plug;
@@ -1240,20 +1484,22 @@ static int __make_request(struct request_queue *q, struct bio *bio)
         * Check if we can merge with the plugged list before grabbing
         * any locks.
         */
-       if (attempt_plug_merge(current, q, bio, &request_count))
-               goto out;
+       if (attempt_plug_merge(q, bio, &request_count))
+               return;
 
        spin_lock_irq(q->queue_lock);
 
        el_ret = elv_merge(q, &req, bio);
        if (el_ret == ELEVATOR_BACK_MERGE) {
                if (bio_attempt_back_merge(q, req, bio)) {
+                       elv_bio_merged(q, req, bio);
                        if (!attempt_back_merge(q, req))
                                elv_merged_request(q, req, el_ret);
                        goto out_unlock;
                }
        } else if (el_ret == ELEVATOR_FRONT_MERGE) {
                if (bio_attempt_front_merge(q, req, bio)) {
+                       elv_bio_merged(q, req, bio);
                        if (!attempt_front_merge(q, req))
                                elv_merged_request(q, req, el_ret);
                        goto out_unlock;
@@ -1274,7 +1520,11 @@ get_rq:
         * Grab a free request. This is might sleep but can not fail.
         * Returns with the queue unlocked.
         */
-       req = get_request_wait(q, rw_flags, bio);
+       req = get_request(q, rw_flags, bio, GFP_NOIO);
+       if (unlikely(!req)) {
+               bio_endio(bio, -ENODEV);        /* @q is dead */
+               goto out_unlock;
+       }
 
        /*
         * After dropping the lock and possibly sleeping here, our request
@@ -1284,8 +1534,7 @@ get_rq:
         */
        init_request_from_bio(req, bio);
 
-       if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
-           bio_flagged(bio, BIO_CPU_AFFINE))
+       if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
                req->cpu = raw_smp_processor_id();
 
        plug = current->plug;
@@ -1298,15 +1547,19 @@ get_rq:
                 */
                if (list_empty(&plug->list))
                        trace_block_plug(q);
-               else if (!plug->should_sort) {
-                       struct request *__rq;
+               else {
+                       if (!plug->should_sort) {
+                               struct request *__rq;
 
-                       __rq = list_entry_rq(plug->list.prev);
-                       if (__rq->q != q)
-                               plug->should_sort = 1;
+                               __rq = list_entry_rq(plug->list.prev);
+                               if (__rq->q != q)
+                                       plug->should_sort = 1;
+                       }
+                       if (request_count >= BLK_MAX_REQUEST_COUNT) {
+                               blk_flush_plug_list(plug, false);
+                               trace_block_plug(q);
+                       }
                }
-               if (request_count >= BLK_MAX_REQUEST_COUNT)
-                       blk_flush_plug_list(plug, false);
                list_add_tail(&req->queuelist, &plug->list);
                drive_stat_acct(req, 1);
        } else {
@@ -1316,9 +1569,8 @@ get_rq:
 out_unlock:
                spin_unlock_irq(q->queue_lock);
        }
-out:
-       return 0;
 }
+EXPORT_SYMBOL_GPL(blk_queue_bio);      /* for device mapper only */
 
 /*
  * If bio->bi_dev is a partition, remap the location
@@ -1417,165 +1669,147 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
        return 0;
 }
 
-/**
- * generic_make_request - hand a buffer to its device driver for I/O
- * @bio:  The bio describing the location in memory and on the device.
- *
- * generic_make_request() is used to make I/O requests of block
- * devices. It is passed a &struct bio, which describes the I/O that needs
- * to be done.
- *
- * generic_make_request() does not return any status.  The
- * success/failure status of the request, along with notification of
- * completion, is delivered asynchronously through the bio->bi_end_io
- * function described (one day) else where.
- *
- * The caller of generic_make_request must make sure that bi_io_vec
- * are set to describe the memory buffer, and that bi_dev and bi_sector are
- * set to describe the device address, and the
- * bi_end_io and optionally bi_private are set to describe how
- * completion notification should be signaled.
- *
- * generic_make_request and the drivers it calls may use bi_next if this
- * bio happens to be merged with someone else, and may change bi_dev and
- * bi_sector for remaps as it sees fit.  So the values of these fields
- * should NOT be depended on after the call to generic_make_request.
- */
-static inline void __generic_make_request(struct bio *bio)
+static noinline_for_stack bool
+generic_make_request_checks(struct bio *bio)
 {
        struct request_queue *q;
-       sector_t old_sector;
-       int ret, nr_sectors = bio_sectors(bio);
-       dev_t old_dev;
+       int nr_sectors = bio_sectors(bio);
        int err = -EIO;
+       char b[BDEVNAME_SIZE];
+       struct hd_struct *part;
 
        might_sleep();
 
        if (bio_check_eod(bio, nr_sectors))
                goto end_io;
 
-       /*
-        * Resolve the mapping until finished. (drivers are
-        * still free to implement/resolve their own stacking
-        * by explicitly returning 0)
-        *
-        * NOTE: we don't repeat the blk_size check for each new device.
-        * Stacking drivers are expected to know what they are doing.
-        */
-       old_sector = -1;
-       old_dev = 0;
-       do {
-               char b[BDEVNAME_SIZE];
-               struct hd_struct *part;
-
-               q = bdev_get_queue(bio->bi_bdev);
-               if (unlikely(!q)) {
-                       printk(KERN_ERR
-                              "generic_make_request: Trying to access "
-                               "nonexistent block-device %s (%Lu)\n",
-                               bdevname(bio->bi_bdev, b),
-                               (long long) bio->bi_sector);
-                       goto end_io;
-               }
-
-               if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
-                            nr_sectors > queue_max_hw_sectors(q))) {
-                       printk(KERN_ERR "bio too big device %s (%u > %u)\n",
-                              bdevname(bio->bi_bdev, b),
-                              bio_sectors(bio),
-                              queue_max_hw_sectors(q));
-                       goto end_io;
-               }
-
-               if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
-                       goto end_io;
-
-               part = bio->bi_bdev->bd_part;
-               if (should_fail_request(part, bio->bi_size) ||
-                   should_fail_request(&part_to_disk(part)->part0,
-                                       bio->bi_size))
-                       goto end_io;
-
-               /*
-                * If this device has partitions, remap block n
-                * of partition p to block n+start(p) of the disk.
-                */
-               blk_partition_remap(bio);
+       q = bdev_get_queue(bio->bi_bdev);
+       if (unlikely(!q)) {
+               printk(KERN_ERR
+                      "generic_make_request: Trying to access "
+                       "nonexistent block-device %s (%Lu)\n",
+                       bdevname(bio->bi_bdev, b),
+                       (long long) bio->bi_sector);
+               goto end_io;
+       }
 
-               if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
-                       goto end_io;
+       if (likely(bio_is_rw(bio) &&
+                  nr_sectors > queue_max_hw_sectors(q))) {
+               printk(KERN_ERR "bio too big device %s (%u > %u)\n",
+                      bdevname(bio->bi_bdev, b),
+                      bio_sectors(bio),
+                      queue_max_hw_sectors(q));
+               goto end_io;
+       }
 
-               if (old_sector != -1)
-                       trace_block_bio_remap(q, bio, old_dev, old_sector);
+       part = bio->bi_bdev->bd_part;
+       if (should_fail_request(part, bio->bi_size) ||
+           should_fail_request(&part_to_disk(part)->part0,
+                               bio->bi_size))
+               goto end_io;
 
-               old_sector = bio->bi_sector;
-               old_dev = bio->bi_bdev->bd_dev;
+       /*
+        * If this device has partitions, remap block n
+        * of partition p to block n+start(p) of the disk.
+        */
+       blk_partition_remap(bio);
 
-               if (bio_check_eod(bio, nr_sectors))
-                       goto end_io;
+       if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
+               goto end_io;
 
-               /*
-                * Filter flush bio's early so that make_request based
-                * drivers without flush support don't have to worry
-                * about them.
-                */
-               if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
-                       bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
-                       if (!nr_sectors) {
-                               err = 0;
-                               goto end_io;
-                       }
-               }
+       if (bio_check_eod(bio, nr_sectors))
+               goto end_io;
 
-               if ((bio->bi_rw & REQ_DISCARD) &&
-                   (!blk_queue_discard(q) ||
-                    ((bio->bi_rw & REQ_SECURE) &&
-                     !blk_queue_secdiscard(q)))) {
-                       err = -EOPNOTSUPP;
+       /*
+        * Filter flush bio's early so that make_request based
+        * drivers without flush support don't have to worry
+        * about them.
+        */
+       if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+               bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+               if (!nr_sectors) {
+                       err = 0;
                        goto end_io;
                }
+       }
 
-               if (blk_throtl_bio(q, &bio))
-                       goto end_io;
+       if ((bio->bi_rw & REQ_DISCARD) &&
+           (!blk_queue_discard(q) ||
+            ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
+               err = -EOPNOTSUPP;
+               goto end_io;
+       }
 
-               /*
-                * If bio = NULL, bio has been throttled and will be submitted
-                * later.
-                */
-               if (!bio)
-                       break;
+       if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
+               err = -EOPNOTSUPP;
+               goto end_io;
+       }
 
-               trace_block_bio_queue(q, bio);
+       /*
+        * Various block parts want %current->io_context and lazy ioc
+        * allocation ends up trading a lot of pain for a small amount of
+        * memory.  Just allocate it upfront.  This may fail and block
+        * layer knows how to live with it.
+        */
+       create_io_context(GFP_ATOMIC, q->node);
 
-               ret = q->make_request_fn(q, bio);
-       } while (ret);
+       if (blk_throtl_bio(q, bio))
+               return false;   /* throttled, will be resubmitted later */
 
-       return;
+       trace_block_bio_queue(q, bio);
+       return true;
 
 end_io:
        bio_endio(bio, err);
+       return false;
 }
 
-/*
- * We only want one ->make_request_fn to be active at a time,
- * else stack usage with stacked devices could be a problem.
- * So use current->bio_list to keep a list of requests
- * submited by a make_request_fn function.
- * current->bio_list is also used as a flag to say if
- * generic_make_request is currently active in this task or not.
- * If it is NULL, then no make_request is active.  If it is non-NULL,
- * then a make_request is active, and new requests should be added
- * at the tail
+/**
+ * generic_make_request - hand a buffer to its device driver for I/O
+ * @bio:  The bio describing the location in memory and on the device.
+ *
+ * generic_make_request() is used to make I/O requests of block
+ * devices. It is passed a &struct bio, which describes the I/O that needs
+ * to be done.
+ *
+ * generic_make_request() does not return any status.  The
+ * success/failure status of the request, along with notification of
+ * completion, is delivered asynchronously through the bio->bi_end_io
+ * function described (one day) else where.
+ *
+ * The caller of generic_make_request must make sure that bi_io_vec
+ * are set to describe the memory buffer, and that bi_dev and bi_sector are
+ * set to describe the device address, and the
+ * bi_end_io and optionally bi_private are set to describe how
+ * completion notification should be signaled.
+ *
+ * generic_make_request and the drivers it calls may use bi_next if this
+ * bio happens to be merged with someone else, and may resubmit the bio to
+ * a lower device by calling into generic_make_request recursively, which
+ * means the bio should NOT be touched after the call to ->make_request_fn.
  */
 void generic_make_request(struct bio *bio)
 {
        struct bio_list bio_list_on_stack;
 
+       if (!generic_make_request_checks(bio))
+               return;
+
+       /*
+        * We only want one ->make_request_fn to be active at a time, else
+        * stack usage with stacked devices could be a problem.  So use
+        * current->bio_list to keep a list of requests submited by a
+        * make_request_fn function.  current->bio_list is also used as a
+        * flag to say if generic_make_request is currently active in this
+        * task or not.  If it is NULL, then no make_request is active.  If
+        * it is non-NULL, then a make_request is active, and new requests
+        * should be added at the tail
+        */
        if (current->bio_list) {
-               /* make_request is active */
                bio_list_add(current->bio_list, bio);
                return;
        }
+
        /* following loop may be a bit non-obvious, and so deserves some
         * explanation.
         * Before entering the loop, bio->bi_next is NULL (as all callers
@@ -1583,22 +1817,21 @@ void generic_make_request(struct bio *bio)
         * We pretend that we have just taken it off a longer list, so
         * we assign bio_list to a pointer to the bio_list_on_stack,
         * thus initialising the bio_list of new bios to be
-        * added.  __generic_make_request may indeed add some more bios
+        * added.  ->make_request() may indeed add some more bios
         * through a recursive call to generic_make_request.  If it
         * did, we find a non-NULL value in bio_list and re-enter the loop
         * from the top.  In this case we really did just take the bio
         * of the top of the list (no pretending) and so remove it from
-        * bio_list, and call into __generic_make_request again.
-        *
-        * The loop was structured like this to make only one call to
-        * __generic_make_request (which is important as it is large and
-        * inlined) and to keep the structure simple.
+        * bio_list, and call into ->make_request() again.
         */
        BUG_ON(bio->bi_next);
        bio_list_init(&bio_list_on_stack);
        current->bio_list = &bio_list_on_stack;
        do {
-               __generic_make_request(bio);
+               struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+               q->make_request_fn(q, bio);
+
                bio = bio_list_pop(current->bio_list);
        } while (bio);
        current->bio_list = NULL; /* deactivate */
@@ -1617,15 +1850,20 @@ EXPORT_SYMBOL(generic_make_request);
  */
 void submit_bio(int rw, struct bio *bio)
 {
-       int count = bio_sectors(bio);
-
        bio->bi_rw |= rw;
 
        /*
         * If it's a regular read/write or a barrier with data attached,
         * go through the normal accounting stuff before submission.
         */
-       if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {
+       if (bio_has_data(bio)) {
+               unsigned int count;
+
+               if (unlikely(rw & REQ_WRITE_SAME))
+                       count = bdev_logical_block_size(bio->bi_bdev) >> 9;
+               else
+                       count = bio_sectors(bio);
+
                if (rw & WRITE) {
                        count_vm_events(PGPGOUT, count);
                } else {
@@ -1671,11 +1909,10 @@ EXPORT_SYMBOL(submit_bio);
  */
 int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 {
-       if (rq->cmd_flags & REQ_DISCARD)
+       if (!rq_mergeable(rq))
                return 0;
 
-       if (blk_rq_sectors(rq) > queue_max_sectors(q) ||
-           blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) {
+       if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) {
                printk(KERN_ERR "%s: over max size limit.\n", __func__);
                return -EIO;
        }
@@ -1714,6 +1951,10 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
                return -EIO;
 
        spin_lock_irqsave(q->queue_lock, flags);
+       if (unlikely(blk_queue_dying(q))) {
+               spin_unlock_irqrestore(q->queue_lock, flags);
+               return -ENODEV;
+       }
 
        /*
         * Submitting request must be dequeued before calling this function
@@ -2055,9 +2296,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
                        error_type = "I/O";
                        break;
                }
-               printk(KERN_ERR "end_request: %s error, dev %s, sector %llu\n",
-                      error_type, req->rq_disk ? req->rq_disk->disk_name : "?",
-                      (unsigned long long)blk_rq_pos(req));
+               printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, sector %llu\n",
+                                  error_type, req->rq_disk ?
+                                  req->rq_disk->disk_name : "?",
+                                  (unsigned long long)blk_rq_pos(req));
+
        }
 
        blk_account_io_completion(req, nr_bytes);
@@ -2141,7 +2384,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
        req->buffer = bio_data(req->bio);
 
        /* update sector only for requests with clear definition of sector */
-       if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD))
+       if (req->cmd_type == REQ_TYPE_FS)
                req->__sector += total_bytes >> 9;
 
        /* mixed attributes always follow the first bio */
@@ -2582,16 +2825,10 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
        blk_rq_init(NULL, rq);
 
        __rq_for_each_bio(bio_src, rq_src) {
-               bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs);
+               bio = bio_clone_bioset(bio_src, gfp_mask, bs);
                if (!bio)
                        goto free_and_out;
 
-               __bio_clone(bio, bio_src);
-
-               if (bio_integrity(bio_src) &&
-                   bio_integrity_clone(bio, bio_src, gfp_mask, bs))
-                       goto free_and_out;
-
                if (bio_ctr && bio_ctr(bio, bio_src, data))
                        goto free_and_out;
 
@@ -2608,7 +2845,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 
 free_and_out:
        if (bio)
-               bio_free(bio, bs);
+               bio_put(bio);
        blk_rq_unprep_clone(rq);
 
        return -ENOMEM;
@@ -2630,6 +2867,20 @@ EXPORT_SYMBOL(kblockd_schedule_delayed_work);
 
 #define PLUG_MAGIC     0x91827364
 
+/**
+ * blk_start_plug - initialize blk_plug and track it inside the task_struct
+ * @plug:      The &struct blk_plug that needs to be initialized
+ *
+ * Description:
+ *   Tracking blk_plug inside the task_struct will help with auto-flushing the
+ *   pending I/O should the task end up blocking between blk_start_plug() and
+ *   blk_finish_plug(). This is important from a performance perspective, but
+ *   also ensures that we don't deadlock. For instance, if the task is blocking
+ *   for a memory allocation, memory reclaim could end up wanting to free a
+ *   page belonging to that request that is currently residing in our private
+ *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
+ *   this kind of deadlock.
+ */
 void blk_start_plug(struct blk_plug *plug)
 {
        struct task_struct *tsk = current;
@@ -2658,7 +2909,8 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
        struct request *rqa = container_of(a, struct request, queuelist);
        struct request *rqb = container_of(b, struct request, queuelist);
 
-       return !(rqa->q <= rqb->q);
+       return !(rqa->q < rqb->q ||
+               (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb)));
 }
 
 /*
@@ -2673,38 +2925,54 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
 {
        trace_block_unplug(q, depth, !from_schedule);
 
-       /*
-        * If we are punting this to kblockd, then we can safely drop
-        * the queue_lock before waking kblockd (which needs to take
-        * this lock).
-        */
-       if (from_schedule) {
-               spin_unlock(q->queue_lock);
+       if (from_schedule)
                blk_run_queue_async(q);
-       } else {
+       else
                __blk_run_queue(q);
-               spin_unlock(q->queue_lock);
-       }
-
+       spin_unlock(q->queue_lock);
 }
 
-static void flush_plug_callbacks(struct blk_plug *plug)
+static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
 {
        LIST_HEAD(callbacks);
 
-       if (list_empty(&plug->cb_list))
-               return;
-
-       list_splice_init(&plug->cb_list, &callbacks);
+       while (!list_empty(&plug->cb_list)) {
+               list_splice_init(&plug->cb_list, &callbacks);
 
-       while (!list_empty(&callbacks)) {
-               struct blk_plug_cb *cb = list_first_entry(&callbacks,
+               while (!list_empty(&callbacks)) {
+                       struct blk_plug_cb *cb = list_first_entry(&callbacks,
                                                          struct blk_plug_cb,
                                                          list);
-               list_del(&cb->list);
-               cb->callback(cb);
+                       list_del(&cb->list);
+                       cb->callback(cb, from_schedule);
+               }
+       }
+}
+
+struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
+                                     int size)
+{
+       struct blk_plug *plug = current->plug;
+       struct blk_plug_cb *cb;
+
+       if (!plug)
+               return NULL;
+
+       list_for_each_entry(cb, &plug->cb_list, list)
+               if (cb->callback == unplug && cb->data == data)
+                       return cb;
+
+       /* Not currently on the callback list */
+       BUG_ON(size < sizeof(*cb));
+       cb = kzalloc(size, GFP_ATOMIC);
+       if (cb) {
+               cb->data = data;
+               cb->callback = unplug;
+               list_add(&cb->list, &plug->cb_list);
        }
+       return cb;
 }
+EXPORT_SYMBOL(blk_check_plugged);
 
 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
@@ -2716,7 +2984,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 
        BUG_ON(plug->magic != PLUG_MAGIC);
 
-       flush_plug_callbacks(plug);
+       flush_plug_callbacks(plug, from_schedule);
        if (list_empty(&plug->list))
                return;
 
@@ -2749,6 +3017,15 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
                        depth = 0;
                        spin_lock(q->queue_lock);
                }
+
+               /*
+                * Short-circuit if @q is dead
+                */
+               if (unlikely(blk_queue_dying(q))) {
+                       __blk_end_request_all(rq, -ENODEV);
+                       continue;
+               }
+
                /*
                 * rq is already accounted, so use raw insert
                 */