block: change force plug flush call order
diff --git a/block/blk-core.c b/block/blk-core.c
index 7e9715ae18c809526864ee1b5272d1dfe6acf451..67dba6941194d5a7b697c8bad2ccb89f11263682 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -198,19 +198,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 }
 EXPORT_SYMBOL(blk_dump_rq_flags);
 
-/*
- * Make sure that plugs that were pending when this function was entered,
- * are now complete and requests pushed to the queue.
-*/
-static inline void queue_sync_plugs(struct request_queue *q)
-{
-       /*
-        * If the current process is plugged and has barriers submitted,
-        * we will livelock if we don't unplug first.
-        */
-       blk_flush_plug(current);
-}
-
 static void blk_delay_work(struct work_struct *work)
 {
        struct request_queue *q;
@@ -233,7 +220,8 @@ static void blk_delay_work(struct work_struct *work)
  */
 void blk_delay_queue(struct request_queue *q, unsigned long msecs)
 {
-       schedule_delayed_work(&q->delay_work, msecs_to_jiffies(msecs));
+       queue_delayed_work(kblockd_workqueue, &q->delay_work,
+                               msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_delay_queue);
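
blk_delay_queue() is the hook drivers use to back off briefly when they cannot dispatch right now; with this change the retry is simply scheduled on kblockd instead of the system workqueue. A minimal caller sketch under that assumption; my_tags_available() and my_dispatch() are hypothetical driver helpers, not part of this patch:

#include <linux/blkdev.h>

static bool my_tags_available(void *hw);                /* hypothetical driver helper */
static void my_dispatch(void *hw, struct request *rq);  /* hypothetical driver helper */

/* Hypothetical request_fn: back off briefly when no hardware tags are
 * free, instead of busy-looping with the queue lock held. */
static void my_request_fn(struct request_queue *q)
{
        struct request *rq;

        while ((rq = blk_peek_request(q)) != NULL) {
                if (!my_tags_available(q->queuedata)) {
                        /* re-run this queue from kblockd in ~3 ms */
                        blk_delay_queue(q, 3);
                        return;
                }
                blk_start_request(rq);
                my_dispatch(q->queuedata, rq);
        }
}
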
 
@@ -271,7 +259,7 @@ EXPORT_SYMBOL(blk_start_queue);
  **/
 void blk_stop_queue(struct request_queue *q)
 {
-       cancel_delayed_work(&q->delay_work);
+       __cancel_delayed_work(&q->delay_work);
        queue_flag_set(QUEUE_FLAG_STOPPED, q);
 }
 EXPORT_SYMBOL(blk_stop_queue);
@@ -289,13 +277,15 @@ EXPORT_SYMBOL(blk_stop_queue);
  *     that its ->make_request_fn will not re-add plugging prior to calling
  *     this function.
  *
+ *     This function does not cancel any asynchronous activity arising
+ *     out of elevator or throttling code. That would require elevator_exit()
+ *     and blk_throtl_exit() to be called with the queue lock initialized.
+ *
  */
 void blk_sync_queue(struct request_queue *q)
 {
        del_timer_sync(&q->timeout);
-       throtl_shutdown_timer_wq(q);
        cancel_delayed_work_sync(&q->delay_work);
-       queue_sync_plugs(q);
 }
 EXPORT_SYMBOL(blk_sync_queue);
 
@@ -306,25 +296,33 @@ EXPORT_SYMBOL(blk_sync_queue);
  * Description:
  *    See @blk_run_queue. This variant must be called with the queue lock
  *    held and interrupts disabled.
- *
  */
 void __blk_run_queue(struct request_queue *q)
 {
        if (unlikely(blk_queue_stopped(q)))
                return;
 
-       /*
-        * Only recurse once to avoid overrunning the stack, let the unplug
-        * handling reinvoke the handler shortly if we already got there.
-        */
-       if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
-               q->request_fn(q);
-               queue_flag_clear(QUEUE_FLAG_REENTER, q);
-       } else
-               queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
+       q->request_fn(q);
 }
 EXPORT_SYMBOL(__blk_run_queue);
 
+/**
+ * blk_run_queue_async - run a single device queue in workqueue context
+ * @q: The queue to run
+ *
+ * Description:
+ *    Tells kblockd to perform the equivalent of @blk_run_queue on our
+ *    behalf.
+ */
+void blk_run_queue_async(struct request_queue *q)
+{
+       if (likely(!blk_queue_stopped(q))) {
+               __cancel_delayed_work(&q->delay_work);
+               queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
+       }
+}
+EXPORT_SYMBOL(blk_run_queue_async);
+
 /**
  * blk_run_queue - run a single device queue
  * @q: The queue to run
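
blk_run_queue_async() gives drivers a way to restart dispatch from contexts where calling the request_fn directly is unwelcome, such as a completion interrupt. A rough sketch under those assumptions; struct my_dev and my_irq_handler() are hypothetical, not part of this patch:

#include <linux/blkdev.h>
#include <linux/interrupt.h>

/* Hypothetical single-request device context. */
struct my_dev {
        struct request_queue    *queue;
        struct request          *in_flight;
};

/* Completion interrupt: finish the in-flight request, then let kblockd
 * dispatch the next one instead of recursing into request_fn here. */
static irqreturn_t my_irq_handler(int irq, void *data)
{
        struct my_dev *dev = data;
        unsigned long flags;

        spin_lock_irqsave(dev->queue->queue_lock, flags);
        __blk_end_request_all(dev->in_flight, 0);       /* completed OK */
        dev->in_flight = NULL;
        blk_run_queue_async(dev->queue);                /* punt to kblockd */
        spin_unlock_irqrestore(dev->queue->queue_lock, flags);

        return IRQ_HANDLED;
}
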
@@ -347,7 +345,13 @@ void blk_put_queue(struct request_queue *q)
 {
        kobject_put(&q->kobj);
 }
+EXPORT_SYMBOL(blk_put_queue);
 
+/*
+ * Note: If a driver supplied the queue lock, it should not zap that lock
+ * unexpectedly as some queue cleanup components like elevator_exit() and
+ * blk_throtl_exit() need queue lock.
+ */
 void blk_cleanup_queue(struct request_queue *q)
 {
        /*
@@ -366,6 +370,8 @@ void blk_cleanup_queue(struct request_queue *q)
        if (q->elevator)
                elevator_exit(q->elevator);
 
+       blk_throtl_exit(q);
+
        blk_put_queue(q);
 }
 EXPORT_SYMBOL(blk_cleanup_queue);
@@ -439,6 +445,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
        mutex_init(&q->sysfs_lock);
        spin_lock_init(&q->__queue_lock);
 
+       /*
+        * By default, initialize queue_lock to the internal lock; the driver
+        * can override it later if need be.
+        */
+       q->queue_lock = &q->__queue_lock;
+
        return q;
 }
 EXPORT_SYMBOL(blk_alloc_queue_node);
@@ -522,7 +534,10 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
        q->prep_rq_fn           = NULL;
        q->unprep_rq_fn         = NULL;
        q->queue_flags          = QUEUE_FLAG_DEFAULT;
-       q->queue_lock           = lock;
+
+       /* Override internal queue lock with supplied lock pointer */
+       if (lock)
+               q->queue_lock           = lock;
 
        /*
         * This also sets hw/phys segments, boundary and size
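
Together with the default initialization added to blk_alloc_queue_node() above, this means a driver may now pass a NULL lock and rely on the queue's internal __queue_lock, or keep supplying its own. A hedged sketch of both variants; my_request_fn() and my_lock are placeholders, not from this patch:

#include <linux/blkdev.h>
#include <linux/spinlock.h>

static void my_request_fn(struct request_queue *q);    /* dispatch routine, not shown */
static DEFINE_SPINLOCK(my_lock);        /* driver lock shared with other driver state */

static struct request_queue *qa, *qb;

static int __init my_setup_queues(void)
{
        /* NULL lock: the queue falls back to its internal __queue_lock. */
        qa = blk_init_queue(my_request_fn, NULL);
        if (!qa)
                return -ENOMEM;

        /* Driver-supplied lock: must stay valid until blk_cleanup_queue(),
         * since elevator_exit() and blk_throtl_exit() still take it. */
        qb = blk_init_queue(my_request_fn, &my_lock);
        if (!qb) {
                blk_cleanup_queue(qa);
                return -ENOMEM;
        }
        return 0;
}
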
@@ -552,11 +567,10 @@ int blk_get_queue(struct request_queue *q)
 
        return 1;
 }
+EXPORT_SYMBOL(blk_get_queue);
 
 static inline void blk_free_request(struct request_queue *q, struct request *rq)
 {
-       BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
-
        if (rq->cmd_flags & REQ_ELVPRIV)
                elv_put_request(q, rq);
        mempool_free(rq, q->rq.rq_pool);
@@ -825,6 +839,9 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 {
        struct request *rq;
 
+       if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+               return NULL;
+
        BUG_ON(rw != READ && rw != WRITE);
 
        spin_lock_irq(q->queue_lock);
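
Callers now have to treat a NULL return as "queue is gone", not only as an atomic allocation failure. A small illustrative caller, not taken from this patch (my_issue_private_cmd() is hypothetical):

#include <linux/blkdev.h>

static int my_issue_private_cmd(struct request_queue *q)
{
        struct request *rq;

        rq = blk_get_request(q, READ, GFP_KERNEL);
        if (!rq)
                return -ENODEV; /* queue marked dead (or allocation failed) */

        /* ... fill in the driver-specific command and issue it ... */

        blk_put_request(rq);
        return 0;
}
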
@@ -1096,14 +1113,6 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 {
        const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 
-       /*
-        * Debug stuff, kill later
-        */
-       if (!rq_mergeable(req)) {
-               blk_dump_rq_flags(req, "back");
-               return false;
-       }
-
        if (!ll_back_merge_fn(q, req, bio))
                return false;
 
@@ -1118,6 +1127,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
        req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
 
        drive_stat_acct(req, 0);
+       elv_bio_merged(q, req, bio);
        return true;
 }
 
@@ -1125,15 +1135,6 @@ static bool bio_attempt_front_merge(struct request_queue *q,
                                    struct request *req, struct bio *bio)
 {
        const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
-       sector_t sector;
-
-       /*
-        * Debug stuff, kill later
-        */
-       if (!rq_mergeable(req)) {
-               blk_dump_rq_flags(req, "front");
-               return false;
-       }
 
        if (!ll_front_merge_fn(q, req, bio))
                return false;
@@ -1143,8 +1144,6 @@ static bool bio_attempt_front_merge(struct request_queue *q,
        if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
                blk_rq_set_mixed_merge(req);
 
-       sector = bio->bi_sector;
-
        bio->bi_next = req->bio;
        req->bio = bio;
 
@@ -1159,12 +1158,13 @@ static bool bio_attempt_front_merge(struct request_queue *q,
        req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
 
        drive_stat_acct(req, 0);
+       elv_bio_merged(q, req, bio);
        return true;
 }
 
 /*
  * Attempts to merge with the plugged list in the current process. Returns
- * true if merge was succesful, otherwise false.
+ * true if merge was successful, otherwise false.
  */
 static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
                               struct bio *bio)
@@ -1244,14 +1244,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
        el_ret = elv_merge(q, &req, bio);
        if (el_ret == ELEVATOR_BACK_MERGE) {
-               BUG_ON(req->cmd_flags & REQ_ON_PLUG);
                if (bio_attempt_back_merge(q, req, bio)) {
                        if (!attempt_back_merge(q, req))
                                elv_merged_request(q, req, el_ret);
                        goto out_unlock;
                }
        } else if (el_ret == ELEVATOR_FRONT_MERGE) {
-               BUG_ON(req->cmd_flags & REQ_ON_PLUG);
                if (bio_attempt_front_merge(q, req, bio)) {
                        if (!attempt_front_merge(q, req))
                                elv_merged_request(q, req, el_ret);
@@ -1284,24 +1282,29 @@ get_rq:
        init_request_from_bio(req, bio);
 
        if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
-           bio_flagged(bio, BIO_CPU_AFFINE)) {
-               req->cpu = blk_cpu_to_group(get_cpu());
-               put_cpu();
-       }
+           bio_flagged(bio, BIO_CPU_AFFINE))
+               req->cpu = raw_smp_processor_id();
 
        plug = current->plug;
        if (plug) {
-               if (!plug->should_sort && !list_empty(&plug->list)) {
+               /*
+                * If this is the first request added after a plug, fire
+                * off a plug trace. If others have been added before, check
+                * if we have multiple devices in this plug. If so, make a
+                * note to sort the list before dispatch.
+                */
+               if (list_empty(&plug->list))
+                       trace_block_plug(q);
+               else if (!plug->should_sort) {
                        struct request *__rq;
 
                        __rq = list_entry_rq(plug->list.prev);
                        if (__rq->q != q)
                                plug->should_sort = 1;
                }
-               /*
-                * Debug flag, kill later
-                */
-               req->cmd_flags |= REQ_ON_PLUG;
+               if (plug->count >= BLK_MAX_REQUEST_COUNT)
+                       blk_flush_plug_list(plug, false);
+               plug->count++;
                list_add_tail(&req->queuelist, &plug->list);
                drive_stat_acct(req, 1);
        } else {
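
The list being filled here belongs to a plug set up by the submitter with blk_start_plug()/blk_finish_plug(); once plug->count reaches BLK_MAX_REQUEST_COUNT the list is flushed early rather than growing without bound. A hedged sketch of the submitter side (my_submit_bios() and its bio array are illustrative, not from this patch):

#include <linux/blkdev.h>
#include <linux/bio.h>

/* Batch a set of bios behind a plug so they can be merged, sorted and
 * dispatched together at blk_finish_plug() time (or earlier, once
 * BLK_MAX_REQUEST_COUNT requests have been plugged). */
static void my_submit_bios(int rw, struct bio **bios, int nr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);
        for (i = 0; i < nr; i++)
                submit_bio(rw, bios[i]);
        blk_finish_plug(&plug);
}
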
@@ -1358,29 +1361,27 @@ static int __init setup_fail_make_request(char *str)
 }
 __setup("fail_make_request=", setup_fail_make_request);
 
-static int should_fail_request(struct bio *bio)
+static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
 {
-       struct hd_struct *part = bio->bi_bdev->bd_part;
-
-       if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail)
-               return should_fail(&fail_make_request, bio->bi_size);
-
-       return 0;
+       return part->make_it_fail && should_fail(&fail_make_request, bytes);
 }
 
 static int __init fail_make_request_debugfs(void)
 {
-       return init_fault_attr_dentries(&fail_make_request,
-                                       "fail_make_request");
+       struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
+                                               NULL, &fail_make_request);
+
+       return IS_ERR(dir) ? PTR_ERR(dir) : 0;
 }
 
 late_initcall(fail_make_request_debugfs);
 
 #else /* CONFIG_FAIL_MAKE_REQUEST */
 
-static inline int should_fail_request(struct bio *bio)
+static inline bool should_fail_request(struct hd_struct *part,
+                                       unsigned int bytes)
 {
-       return 0;
+       return false;
 }
 
 #endif /* CONFIG_FAIL_MAKE_REQUEST */
@@ -1463,6 +1464,7 @@ static inline void __generic_make_request(struct bio *bio)
        old_dev = 0;
        do {
                char b[BDEVNAME_SIZE];
+               struct hd_struct *part;
 
                q = bdev_get_queue(bio->bi_bdev);
                if (unlikely(!q)) {
@@ -1486,7 +1488,10 @@ static inline void __generic_make_request(struct bio *bio)
                if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
                        goto end_io;
 
-               if (should_fail_request(bio))
+               part = bio->bi_bdev->bd_part;
+               if (should_fail_request(part, bio->bi_size) ||
+                   should_fail_request(&part_to_disk(part)->part0,
+                                       bio->bi_size))
                        goto end_io;
 
                /*
@@ -1528,7 +1533,8 @@ static inline void __generic_make_request(struct bio *bio)
                        goto end_io;
                }
 
-               blk_throtl_bio(q, &bio);
+               if (blk_throtl_bio(q, &bio))
+                       goto end_io;
 
                /*
                 * If bio = NULL, bio has been throttled and will be submitted
@@ -1696,15 +1702,14 @@ EXPORT_SYMBOL_GPL(blk_rq_check_limits);
 int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 {
        unsigned long flags;
+       int where = ELEVATOR_INSERT_BACK;
 
        if (blk_rq_check_limits(q, rq))
                return -EIO;
 
-#ifdef CONFIG_FAIL_MAKE_REQUEST
-       if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
-           should_fail(&fail_make_request, blk_rq_bytes(rq)))
+       if (rq->rq_disk &&
+           should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
                return -EIO;
-#endif
 
        spin_lock_irqsave(q->queue_lock, flags);
 
@@ -1714,7 +1719,10 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
         */
        BUG_ON(blk_queued_rq(rq));
 
-       add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
+       if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA))
+               where = ELEVATOR_INSERT_FLUSH;
+
+       add_acct_request(q, rq, where);
        spin_unlock_irqrestore(q->queue_lock, flags);
 
        return 0;
@@ -2026,9 +2034,26 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 
        if (error && req->cmd_type == REQ_TYPE_FS &&
            !(req->cmd_flags & REQ_QUIET)) {
-               printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n",
-                               req->rq_disk ? req->rq_disk->disk_name : "?",
-                               (unsigned long long)blk_rq_pos(req));
+               char *error_type;
+
+               switch (error) {
+               case -ENOLINK:
+                       error_type = "recoverable transport";
+                       break;
+               case -EREMOTEIO:
+                       error_type = "critical target";
+                       break;
+               case -EBADE:
+                       error_type = "critical nexus";
+                       break;
+               case -EIO:
+               default:
+                       error_type = "I/O";
+                       break;
+               }
+               printk(KERN_ERR "end_request: %s error, dev %s, sector %llu\n",
+                      error_type, req->rq_disk ? req->rq_disk->disk_name : "?",
+                      (unsigned long long)blk_rq_pos(req));
        }
 
        blk_account_io_completion(req, nr_bytes);
@@ -2126,7 +2151,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
         * size, something has gone terribly wrong.
         */
        if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
-               printk(KERN_ERR "blk: request botched\n");
+               blk_dump_rq_flags(req, "request botched");
                req->__data_len = blk_rq_cur_bytes(req);
        }
 
@@ -2254,7 +2279,7 @@ static bool blk_end_bidi_request(struct request *rq, int error,
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  **/
-static bool __blk_end_bidi_request(struct request *rq, int error,
+bool __blk_end_bidi_request(struct request *rq, int error,
                                   unsigned int nr_bytes, unsigned int bidi_bytes)
 {
        if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
@@ -2607,7 +2632,9 @@ void blk_start_plug(struct blk_plug *plug)
 
        plug->magic = PLUG_MAGIC;
        INIT_LIST_HEAD(&plug->list);
+       INIT_LIST_HEAD(&plug->cb_list);
        plug->should_sort = 0;
+       plug->count = 0;
 
        /*
         * If this is a nested plug, don't actually assign it. It will be
@@ -2628,76 +2655,126 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
        struct request *rqa = container_of(a, struct request, queuelist);
        struct request *rqb = container_of(b, struct request, queuelist);
 
-       return !(rqa->q == rqb->q);
+       return !(rqa->q <= rqb->q);
+}
+
+/*
+ * If 'from_schedule' is true, then postpone the dispatch of requests
+ * until a safe kblockd context. We do this to avoid accidental large
+ * additional stack usage in driver dispatch, in places where the original
+ * plugger did not intend it.
+ */
+static void queue_unplugged(struct request_queue *q, unsigned int depth,
+                           bool from_schedule)
+       __releases(q->queue_lock)
+{
+       trace_block_unplug(q, depth, !from_schedule);
+
+       /*
+        * If we are punting this to kblockd, then we can safely drop
+        * the queue_lock before waking kblockd (which needs to take
+        * this lock).
+        */
+       if (from_schedule) {
+               spin_unlock(q->queue_lock);
+               blk_run_queue_async(q);
+       } else {
+               __blk_run_queue(q);
+               spin_unlock(q->queue_lock);
+       }
+
+}
+
+static void flush_plug_callbacks(struct blk_plug *plug)
+{
+       LIST_HEAD(callbacks);
+
+       if (list_empty(&plug->cb_list))
+               return;
+
+       list_splice_init(&plug->cb_list, &callbacks);
+
+       while (!list_empty(&callbacks)) {
+               struct blk_plug_cb *cb = list_first_entry(&callbacks,
+                                                         struct blk_plug_cb,
+                                                         list);
+               list_del(&cb->list);
+               cb->callback(cb);
+       }
 }
 
-static void flush_plug_list(struct blk_plug *plug)
+void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
        struct request_queue *q;
        unsigned long flags;
        struct request *rq;
+       LIST_HEAD(list);
+       unsigned int depth;
 
        BUG_ON(plug->magic != PLUG_MAGIC);
 
+       flush_plug_callbacks(plug);
        if (list_empty(&plug->list))
                return;
 
-       if (plug->should_sort)
-               list_sort(NULL, &plug->list, plug_rq_cmp);
+       list_splice_init(&plug->list, &list);
+       plug->count = 0;
+
+       if (plug->should_sort) {
+               list_sort(NULL, &list, plug_rq_cmp);
+               plug->should_sort = 0;
+       }
 
        q = NULL;
+       depth = 0;
+
+       /*
+        * Save and disable interrupts here, to avoid doing it for every
+        * queue lock we have to take.
+        */
        local_irq_save(flags);
-       while (!list_empty(&plug->list)) {
-               rq = list_entry_rq(plug->list.next);
+       while (!list_empty(&list)) {
+               rq = list_entry_rq(list.next);
                list_del_init(&rq->queuelist);
-               BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
                BUG_ON(!rq->q);
                if (rq->q != q) {
-                       if (q) {
-                               __blk_run_queue(q);
-                               spin_unlock(q->queue_lock);
-                       }
+                       /*
+                        * This drops the queue lock
+                        */
+                       if (q)
+                               queue_unplugged(q, depth, from_schedule);
                        q = rq->q;
+                       depth = 0;
                        spin_lock(q->queue_lock);
                }
-               rq->cmd_flags &= ~REQ_ON_PLUG;
-
                /*
                 * rq is already accounted, so use raw insert
                 */
-               __elv_add_request(q, rq, ELEVATOR_INSERT_SORT);
-       }
+               if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA))
+                       __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
+               else
+                       __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
 
-       if (q) {
-               __blk_run_queue(q);
-               spin_unlock(q->queue_lock);
+               depth++;
        }
 
-       BUG_ON(!list_empty(&plug->list));
-       local_irq_restore(flags);
-}
-
-static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug)
-{
-       flush_plug_list(plug);
+       /*
+        * This drops the queue lock
+        */
+       if (q)
+               queue_unplugged(q, depth, from_schedule);
 
-       if (plug == tsk->plug)
-               tsk->plug = NULL;
+       local_irq_restore(flags);
 }
 
 void blk_finish_plug(struct blk_plug *plug)
 {
-       if (plug)
-               __blk_finish_plug(current, plug);
-}
-EXPORT_SYMBOL(blk_finish_plug);
+       blk_flush_plug_list(plug, false);
 
-void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug)
-{
-       __blk_finish_plug(tsk, plug);
-       tsk->plug = plug;
+       if (plug == current->plug)
+               current->plug = NULL;
 }
-EXPORT_SYMBOL(__blk_flush_plug);
+EXPORT_SYMBOL(blk_finish_plug);
 
 int __init blk_dev_init(void)
 {
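
The cb_list drained by flush_plug_callbacks() above lets higher layers attach work to be run when the plug is flushed. A rough sketch of how a caller might use it, assuming the struct blk_plug_cb layout implied by this patch; struct my_ctx, my_unplug_cb() and my_arm_plug_callback() are illustrative, and a real user would also avoid queueing the same callback twice:

#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>

/* Private context embedding the plug callback. */
struct my_ctx {
        struct blk_plug_cb      cb;
        /* ... per-caller state ... */
};

/* Invoked from flush_plug_callbacks() after the cb has been unlinked. */
static void my_unplug_cb(struct blk_plug_cb *cb)
{
        struct my_ctx *ctx = container_of(cb, struct my_ctx, cb);

        /* kick any deferred work here, then release the context */
        kfree(ctx);
}

static void my_arm_plug_callback(void)
{
        struct blk_plug *plug = current->plug;
        struct my_ctx *ctx;

        if (!plug)
                return;                 /* caller is not plugged */

        ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
        if (!ctx)
                return;

        ctx->cb.callback = my_unplug_cb;
        list_add_tail(&ctx->cb.list, &plug->cb_list);
}
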