blktrace: support per-partition tracing
[linux-2.6.git] / kernel / trace / blktrace.c
1 /*
2  * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License version 2 as
6  * published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11  * GNU General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
16  *
17  */
18 #include <linux/kernel.h>
19 #include <linux/blkdev.h>
20 #include <linux/blktrace_api.h>
21 #include <linux/percpu.h>
22 #include <linux/init.h>
23 #include <linux/mutex.h>
24 #include <linux/debugfs.h>
25 #include <linux/time.h>
26 #include <trace/block.h>
27 #include <linux/uaccess.h>
28 #include "trace_output.h"
29
30 static unsigned int blktrace_seq __read_mostly = 1;
31
32 static struct trace_array *blk_tr;
33 static bool blk_tracer_enabled __read_mostly;
34
35 /* Select an alternative, minimalistic output than the original one */
36 #define TRACE_BLK_OPT_CLASSIC   0x1
37
38 static struct tracer_opt blk_tracer_opts[] = {
39         /* Default disable the minimalistic output */
40         { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
41         { }
42 };
43
44 static struct tracer_flags blk_tracer_flags = {
45         .val  = 0,
46         .opts = blk_tracer_opts,
47 };
48
49 /* Global reference count of probes */
50 static atomic_t blk_probes_ref = ATOMIC_INIT(0);
51
52 static void blk_register_tracepoints(void);
53 static void blk_unregister_tracepoints(void);
54
55 /*
56  * Send out a notify message.
57  */
58 static void trace_note(struct blk_trace *bt, pid_t pid, int action,
59                        const void *data, size_t len)
60 {
61         struct blk_io_trace *t;
62         struct ring_buffer_event *event = NULL;
63         int pc = 0;
64         int cpu = smp_processor_id();
65         bool blk_tracer = blk_tracer_enabled;
66
67         if (blk_tracer) {
68                 pc = preempt_count();
69                 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
70                                                   sizeof(*t) + len,
71                                                   0, pc);
72                 if (!event)
73                         return;
74                 t = ring_buffer_event_data(event);
75                 goto record_it;
76         }
77
78         if (!bt->rchan)
79                 return;
80
81         t = relay_reserve(bt->rchan, sizeof(*t) + len);
82         if (t) {
83                 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
84                 t->time = ktime_to_ns(ktime_get());
85 record_it:
86                 t->device = bt->dev;
87                 t->action = action;
88                 t->pid = pid;
89                 t->cpu = cpu;
90                 t->pdu_len = len;
91                 memcpy((void *) t + sizeof(*t), data, len);
92
93                 if (blk_tracer)
94                         trace_buffer_unlock_commit(blk_tr, event, 0, pc);
95         }
96 }
97
98 /*
99  * Send out a notify for this process, if we haven't done so since a trace
100  * started
101  */
102 static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
103 {
104         tsk->btrace_seq = blktrace_seq;
105         trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
106 }
107
108 static void trace_note_time(struct blk_trace *bt)
109 {
110         struct timespec now;
111         unsigned long flags;
112         u32 words[2];
113
114         getnstimeofday(&now);
115         words[0] = now.tv_sec;
116         words[1] = now.tv_nsec;
117
118         local_irq_save(flags);
119         trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
120         local_irq_restore(flags);
121 }
122
123 void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
124 {
125         int n;
126         va_list args;
127         unsigned long flags;
128         char *buf;
129
130         if (unlikely(bt->trace_state != Blktrace_running &&
131                      !blk_tracer_enabled))
132                 return;
133
134         local_irq_save(flags);
135         buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
136         va_start(args, fmt);
137         n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
138         va_end(args);
139
140         trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
141         local_irq_restore(flags);
142 }
143 EXPORT_SYMBOL_GPL(__trace_note_message);
144
145 static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
146                          pid_t pid)
147 {
148         if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
149                 return 1;
150         if (sector && (sector < bt->start_lba || sector > bt->end_lba))
151                 return 1;
152         if (bt->pid && pid != bt->pid)
153                 return 1;
154
155         return 0;
156 }
157
158 /*
159  * Data direction bit lookup
160  */
161 static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
162                                  BLK_TC_ACT(BLK_TC_WRITE) };
163
164 /* The ilog2() calls fall out because they're constant */
165 #define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
166           (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name))
167
168 /*
169  * The worker for the various blk_add_trace*() types. Fills out a
170  * blk_io_trace structure and places it in a per-cpu subbuffer.
171  */
172 static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
173                      int rw, u32 what, int error, int pdu_len, void *pdu_data)
174 {
175         struct task_struct *tsk = current;
176         struct ring_buffer_event *event = NULL;
177         struct blk_io_trace *t;
178         unsigned long flags = 0;
179         unsigned long *sequence;
180         pid_t pid;
181         int cpu, pc = 0;
182         bool blk_tracer = blk_tracer_enabled;
183
184         if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
185                 return;
186
187         what |= ddir_act[rw & WRITE];
188         what |= MASK_TC_BIT(rw, BARRIER);
189         what |= MASK_TC_BIT(rw, SYNCIO);
190         what |= MASK_TC_BIT(rw, AHEAD);
191         what |= MASK_TC_BIT(rw, META);
192         what |= MASK_TC_BIT(rw, DISCARD);
193
194         pid = tsk->pid;
195         if (act_log_check(bt, what, sector, pid))
196                 return;
197         cpu = raw_smp_processor_id();
198
199         if (blk_tracer) {
200                 tracing_record_cmdline(current);
201
202                 pc = preempt_count();
203                 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
204                                                   sizeof(*t) + pdu_len,
205                                                   0, pc);
206                 if (!event)
207                         return;
208                 t = ring_buffer_event_data(event);
209                 goto record_it;
210         }
211
212         /*
213          * A word about the locking here - we disable interrupts to reserve
214          * some space in the relay per-cpu buffer, to prevent an irq
215          * from coming in and stepping on our toes.
216          */
217         local_irq_save(flags);
218
219         if (unlikely(tsk->btrace_seq != blktrace_seq))
220                 trace_note_tsk(bt, tsk);
221
222         t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
223         if (t) {
224                 sequence = per_cpu_ptr(bt->sequence, cpu);
225
226                 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
227                 t->sequence = ++(*sequence);
228                 t->time = ktime_to_ns(ktime_get());
229 record_it:
230                 /*
231                  * These two are not needed in ftrace as they are in the
232                  * generic trace_entry, filled by tracing_generic_entry_update,
233                  * but for the trace_event->bin() synthesizer benefit we do it
234                  * here too.
235                  */
236                 t->cpu = cpu;
237                 t->pid = pid;
238
239                 t->sector = sector;
240                 t->bytes = bytes;
241                 t->action = what;
242                 t->device = bt->dev;
243                 t->error = error;
244                 t->pdu_len = pdu_len;
245
246                 if (pdu_len)
247                         memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
248
249                 if (blk_tracer) {
250                         trace_buffer_unlock_commit(blk_tr, event, 0, pc);
251                         return;
252                 }
253         }
254
255         local_irq_restore(flags);
256 }
257
/* Serializes creation of the shared debugfs root directory below */
static DEFINE_MUTEX(blk_tree_mutex);
/* debugfs "block" directory holding one sub-directory per traced device */
static struct dentry *blk_tree_root;
260
261 static void blk_trace_free(struct blk_trace *bt)
262 {
263         debugfs_remove(bt->msg_file);
264         debugfs_remove(bt->dropped_file);
265         relay_close(bt->rchan);
266         free_percpu(bt->sequence);
267         free_percpu(bt->msg_data);
268         kfree(bt);
269 }
270
271 static void blk_trace_cleanup(struct blk_trace *bt)
272 {
273         blk_trace_free(bt);
274         if (atomic_dec_and_test(&blk_probes_ref))
275                 blk_unregister_tracepoints();
276 }
277
278 int blk_trace_remove(struct request_queue *q)
279 {
280         struct blk_trace *bt;
281
282         bt = xchg(&q->blk_trace, NULL);
283         if (!bt)
284                 return -EINVAL;
285
286         if (bt->trace_state != Blktrace_running)
287                 blk_trace_cleanup(bt);
288
289         return 0;
290 }
291 EXPORT_SYMBOL_GPL(blk_trace_remove);
292
293 static int blk_dropped_open(struct inode *inode, struct file *filp)
294 {
295         filp->private_data = inode->i_private;
296
297         return 0;
298 }
299
300 static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
301                                 size_t count, loff_t *ppos)
302 {
303         struct blk_trace *bt = filp->private_data;
304         char buf[16];
305
306         snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
307
308         return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
309 }
310
311 static const struct file_operations blk_dropped_fops = {
312         .owner =        THIS_MODULE,
313         .open =         blk_dropped_open,
314         .read =         blk_dropped_read,
315 };
316
317 static int blk_msg_open(struct inode *inode, struct file *filp)
318 {
319         filp->private_data = inode->i_private;
320
321         return 0;
322 }
323
324 static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
325                                 size_t count, loff_t *ppos)
326 {
327         char *msg;
328         struct blk_trace *bt;
329
330         if (count >= BLK_TN_MAX_MSG)
331                 return -EINVAL;
332
333         msg = kmalloc(count + 1, GFP_KERNEL);
334         if (msg == NULL)
335                 return -ENOMEM;
336
337         if (copy_from_user(msg, buffer, count)) {
338                 kfree(msg);
339                 return -EFAULT;
340         }
341
342         msg[count] = '\0';
343         bt = filp->private_data;
344         __trace_note_message(bt, "%s", msg);
345         kfree(msg);
346
347         return count;
348 }
349
350 static const struct file_operations blk_msg_fops = {
351         .owner =        THIS_MODULE,
352         .open =         blk_msg_open,
353         .write =        blk_msg_write,
354 };
355
356 /*
357  * Keep track of how many times we encountered a full subbuffer, to aid
358  * the user space app in telling how many lost events there were.
359  */
360 static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
361                                      void *prev_subbuf, size_t prev_padding)
362 {
363         struct blk_trace *bt;
364
365         if (!relay_buf_full(buf))
366                 return 1;
367
368         bt = buf->chan->private_data;
369         atomic_inc(&bt->dropped);
370         return 0;
371 }
372
373 static int blk_remove_buf_file_callback(struct dentry *dentry)
374 {
375         struct dentry *parent = dentry->d_parent;
376         debugfs_remove(dentry);
377
378         /*
379         * this will fail for all but the last file, but that is ok. what we
380         * care about is the top level buts->name directory going away, when
381         * the last trace file is gone. Then we don't have to rmdir() that
382         * manually on trace stop, so it nicely solves the issue with
383         * force killing of running traces.
384         */
385
386         debugfs_remove(parent);
387         return 0;
388 }
389
390 static struct dentry *blk_create_buf_file_callback(const char *filename,
391                                                    struct dentry *parent,
392                                                    int mode,
393                                                    struct rchan_buf *buf,
394                                                    int *is_global)
395 {
396         return debugfs_create_file(filename, mode, parent, buf,
397                                         &relay_file_operations);
398 }
399
400 static struct rchan_callbacks blk_relay_callbacks = {
401         .subbuf_start           = blk_subbuf_start_callback,
402         .create_buf_file        = blk_create_buf_file_callback,
403         .remove_buf_file        = blk_remove_buf_file_callback,
404 };
405
406 /*
407  * Setup everything required to start tracing
408  */
409 int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
410                        struct block_device *bdev,
411                        struct blk_user_trace_setup *buts)
412 {
413         struct blk_trace *old_bt, *bt = NULL;
414         struct dentry *dir = NULL;
415         int ret, i;
416         struct hd_struct *part = NULL;
417
418         if (!buts->buf_size || !buts->buf_nr)
419                 return -EINVAL;
420
421         strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
422         buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
423
424         /*
425          * some device names have larger paths - convert the slashes
426          * to underscores for this to work as expected
427          */
428         for (i = 0; i < strlen(buts->name); i++)
429                 if (buts->name[i] == '/')
430                         buts->name[i] = '_';
431
432         bt = kzalloc(sizeof(*bt), GFP_KERNEL);
433         if (!bt)
434                 return -ENOMEM;
435
436         ret = -ENOMEM;
437         bt->sequence = alloc_percpu(unsigned long);
438         if (!bt->sequence)
439                 goto err;
440
441         bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
442         if (!bt->msg_data)
443                 goto err;
444
445         ret = -ENOENT;
446
447         mutex_lock(&blk_tree_mutex);
448         if (!blk_tree_root) {
449                 blk_tree_root = debugfs_create_dir("block", NULL);
450                 if (!blk_tree_root) {
451                         mutex_unlock(&blk_tree_mutex);
452                         goto err;
453                 }
454         }
455         mutex_unlock(&blk_tree_mutex);
456
457         dir = debugfs_create_dir(buts->name, blk_tree_root);
458
459         if (!dir)
460                 goto err;
461
462         bt->dir = dir;
463         bt->dev = dev;
464         atomic_set(&bt->dropped, 0);
465
466         ret = -EIO;
467         bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
468                                                &blk_dropped_fops);
469         if (!bt->dropped_file)
470                 goto err;
471
472         bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
473         if (!bt->msg_file)
474                 goto err;
475
476         bt->rchan = relay_open("trace", dir, buts->buf_size,
477                                 buts->buf_nr, &blk_relay_callbacks, bt);
478         if (!bt->rchan)
479                 goto err;
480
481         bt->act_mask = buts->act_mask;
482         if (!bt->act_mask)
483                 bt->act_mask = (u16) -1;
484
485         if (bdev)
486                 part = bdev->bd_part;
487
488         if (part) {
489                 bt->start_lba = part->start_sect;
490                 bt->end_lba = part->start_sect + part->nr_sects;
491         } else
492                 bt->end_lba = -1ULL;
493
494         /* overwrite with user settings */
495         if (buts->start_lba)
496                 bt->start_lba = buts->start_lba;
497         if (buts->end_lba)
498                 bt->end_lba = buts->end_lba;
499
500         bt->pid = buts->pid;
501         bt->trace_state = Blktrace_setup;
502
503         ret = -EBUSY;
504         old_bt = xchg(&q->blk_trace, bt);
505         if (old_bt) {
506                 (void) xchg(&q->blk_trace, old_bt);
507                 goto err;
508         }
509
510         if (atomic_inc_return(&blk_probes_ref) == 1)
511                 blk_register_tracepoints();
512
513         return 0;
514 err:
515         blk_trace_free(bt);
516         return ret;
517 }
518
519 int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
520                     struct block_device *bdev,
521                     char __user *arg)
522 {
523         struct blk_user_trace_setup buts;
524         int ret;
525
526         ret = copy_from_user(&buts, arg, sizeof(buts));
527         if (ret)
528                 return -EFAULT;
529
530         ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
531         if (ret)
532                 return ret;
533
534         if (copy_to_user(arg, &buts, sizeof(buts)))
535                 return -EFAULT;
536
537         return 0;
538 }
539 EXPORT_SYMBOL_GPL(blk_trace_setup);
540
541 int blk_trace_startstop(struct request_queue *q, int start)
542 {
543         int ret;
544         struct blk_trace *bt = q->blk_trace;
545
546         if (bt == NULL)
547                 return -EINVAL;
548
549         /*
550          * For starting a trace, we can transition from a setup or stopped
551          * trace. For stopping a trace, the state must be running
552          */
553         ret = -EINVAL;
554         if (start) {
555                 if (bt->trace_state == Blktrace_setup ||
556                     bt->trace_state == Blktrace_stopped) {
557                         blktrace_seq++;
558                         smp_mb();
559                         bt->trace_state = Blktrace_running;
560
561                         trace_note_time(bt);
562                         ret = 0;
563                 }
564         } else {
565                 if (bt->trace_state == Blktrace_running) {
566                         bt->trace_state = Blktrace_stopped;
567                         relay_flush(bt->rchan);
568                         ret = 0;
569                 }
570         }
571
572         return ret;
573 }
574 EXPORT_SYMBOL_GPL(blk_trace_startstop);
575
576 /**
577  * blk_trace_ioctl: - handle the ioctls associated with tracing
578  * @bdev:       the block device
579  * @cmd:        the ioctl cmd
580  * @arg:        the argument data, if any
581  *
582  **/
583 int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
584 {
585         struct request_queue *q;
586         int ret, start = 0;
587         char b[BDEVNAME_SIZE];
588
589         q = bdev_get_queue(bdev);
590         if (!q)
591                 return -ENXIO;
592
593         mutex_lock(&bdev->bd_mutex);
594
595         switch (cmd) {
596         case BLKTRACESETUP:
597                 bdevname(bdev, b);
598                 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
599                 break;
600         case BLKTRACESTART:
601                 start = 1;
602         case BLKTRACESTOP:
603                 ret = blk_trace_startstop(q, start);
604                 break;
605         case BLKTRACETEARDOWN:
606                 ret = blk_trace_remove(q);
607                 break;
608         default:
609                 ret = -ENOTTY;
610                 break;
611         }
612
613         mutex_unlock(&bdev->bd_mutex);
614         return ret;
615 }
616
617 /**
618  * blk_trace_shutdown: - stop and cleanup trace structures
619  * @q:    the request queue associated with the device
620  *
621  **/
622 void blk_trace_shutdown(struct request_queue *q)
623 {
624         if (q->blk_trace) {
625                 blk_trace_startstop(q, 0);
626                 blk_trace_remove(q);
627         }
628 }
629
630 /*
631  * blktrace probes
632  */
633
634 /**
635  * blk_add_trace_rq - Add a trace for a request oriented action
636  * @q:          queue the io is for
637  * @rq:         the source request
638  * @what:       the action
639  *
640  * Description:
641  *     Records an action against a request. Will log the bio offset + size.
642  *
643  **/
644 static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
645                                     u32 what)
646 {
647         struct blk_trace *bt = q->blk_trace;
648         int rw = rq->cmd_flags & 0x03;
649
650         if (likely(!bt))
651                 return;
652
653         if (blk_discard_rq(rq))
654                 rw |= (1 << BIO_RW_DISCARD);
655
656         if (blk_pc_request(rq)) {
657                 what |= BLK_TC_ACT(BLK_TC_PC);
658                 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
659                                 rq->cmd_len, rq->cmd);
660         } else  {
661                 what |= BLK_TC_ACT(BLK_TC_FS);
662                 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
663                                 rw, what, rq->errors, 0, NULL);
664         }
665 }
666
667 static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
668 {
669         blk_add_trace_rq(q, rq, BLK_TA_ABORT);
670 }
671
672 static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
673 {
674         blk_add_trace_rq(q, rq, BLK_TA_INSERT);
675 }
676
677 static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
678 {
679         blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
680 }
681
682 static void blk_add_trace_rq_requeue(struct request_queue *q,
683                                      struct request *rq)
684 {
685         blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
686 }
687
688 static void blk_add_trace_rq_complete(struct request_queue *q,
689                                       struct request *rq)
690 {
691         blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
692 }
693
694 /**
695  * blk_add_trace_bio - Add a trace for a bio oriented action
696  * @q:          queue the io is for
697  * @bio:        the source bio
698  * @what:       the action
699  *
700  * Description:
701  *     Records an action against a bio. Will log the bio offset + size.
702  *
703  **/
704 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
705                                      u32 what)
706 {
707         struct blk_trace *bt = q->blk_trace;
708
709         if (likely(!bt))
710                 return;
711
712         __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
713                         !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
714 }
715
716 static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
717 {
718         blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
719 }
720
721 static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
722 {
723         blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
724 }
725
726 static void blk_add_trace_bio_backmerge(struct request_queue *q,
727                                         struct bio *bio)
728 {
729         blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
730 }
731
732 static void blk_add_trace_bio_frontmerge(struct request_queue *q,
733                                          struct bio *bio)
734 {
735         blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
736 }
737
738 static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
739 {
740         blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
741 }
742
743 static void blk_add_trace_getrq(struct request_queue *q,
744                                 struct bio *bio, int rw)
745 {
746         if (bio)
747                 blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
748         else {
749                 struct blk_trace *bt = q->blk_trace;
750
751                 if (bt)
752                         __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
753         }
754 }
755
756
757 static void blk_add_trace_sleeprq(struct request_queue *q,
758                                   struct bio *bio, int rw)
759 {
760         if (bio)
761                 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
762         else {
763                 struct blk_trace *bt = q->blk_trace;
764
765                 if (bt)
766                         __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
767                                         0, 0, NULL);
768         }
769 }
770
771 static void blk_add_trace_plug(struct request_queue *q)
772 {
773         struct blk_trace *bt = q->blk_trace;
774
775         if (bt)
776                 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
777 }
778
779 static void blk_add_trace_unplug_io(struct request_queue *q)
780 {
781         struct blk_trace *bt = q->blk_trace;
782
783         if (bt) {
784                 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
785                 __be64 rpdu = cpu_to_be64(pdu);
786
787                 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
788                                 sizeof(rpdu), &rpdu);
789         }
790 }
791
792 static void blk_add_trace_unplug_timer(struct request_queue *q)
793 {
794         struct blk_trace *bt = q->blk_trace;
795
796         if (bt) {
797                 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
798                 __be64 rpdu = cpu_to_be64(pdu);
799
800                 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
801                                 sizeof(rpdu), &rpdu);
802         }
803 }
804
805 static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
806                                 unsigned int pdu)
807 {
808         struct blk_trace *bt = q->blk_trace;
809
810         if (bt) {
811                 __be64 rpdu = cpu_to_be64(pdu);
812
813                 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
814                                 BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
815                                 sizeof(rpdu), &rpdu);
816         }
817 }
818
819 /**
820  * blk_add_trace_remap - Add a trace for a remap operation
821  * @q:          queue the io is for
822  * @bio:        the source bio
823  * @dev:        target device
824  * @from:       source sector
825  * @to:         target sector
826  *
827  * Description:
828  *     Device mapper or raid target sometimes need to split a bio because
829  *     it spans a stripe (or similar). Add a trace for that action.
830  *
831  **/
832 static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
833                                        dev_t dev, sector_t from, sector_t to)
834 {
835         struct blk_trace *bt = q->blk_trace;
836         struct blk_io_trace_remap r;
837
838         if (likely(!bt))
839                 return;
840
841         r.device = cpu_to_be32(dev);
842         r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
843         r.sector = cpu_to_be64(to);
844
845         __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
846                         !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
847 }
848
849 /**
850  * blk_add_driver_data - Add binary message with driver-specific data
851  * @q:          queue the io is for
852  * @rq:         io request
853  * @data:       driver-specific data
854  * @len:        length of driver-specific data
855  *
856  * Description:
857  *     Some drivers might want to write driver-specific data per request.
858  *
859  **/
860 void blk_add_driver_data(struct request_queue *q,
861                          struct request *rq,
862                          void *data, size_t len)
863 {
864         struct blk_trace *bt = q->blk_trace;
865
866         if (likely(!bt))
867                 return;
868
869         if (blk_pc_request(rq))
870                 __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
871                                 rq->errors, len, data);
872         else
873                 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
874                                 0, BLK_TA_DRV_DATA, rq->errors, len, data);
875 }
876 EXPORT_SYMBOL_GPL(blk_add_driver_data);
877
878 static void blk_register_tracepoints(void)
879 {
880         int ret;
881
882         ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
883         WARN_ON(ret);
884         ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
885         WARN_ON(ret);
886         ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
887         WARN_ON(ret);
888         ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
889         WARN_ON(ret);
890         ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
891         WARN_ON(ret);
892         ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
893         WARN_ON(ret);
894         ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
895         WARN_ON(ret);
896         ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
897         WARN_ON(ret);
898         ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
899         WARN_ON(ret);
900         ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
901         WARN_ON(ret);
902         ret = register_trace_block_getrq(blk_add_trace_getrq);
903         WARN_ON(ret);
904         ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
905         WARN_ON(ret);
906         ret = register_trace_block_plug(blk_add_trace_plug);
907         WARN_ON(ret);
908         ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
909         WARN_ON(ret);
910         ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
911         WARN_ON(ret);
912         ret = register_trace_block_split(blk_add_trace_split);
913         WARN_ON(ret);
914         ret = register_trace_block_remap(blk_add_trace_remap);
915         WARN_ON(ret);
916 }
917
918 static void blk_unregister_tracepoints(void)
919 {
920         unregister_trace_block_remap(blk_add_trace_remap);
921         unregister_trace_block_split(blk_add_trace_split);
922         unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
923         unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
924         unregister_trace_block_plug(blk_add_trace_plug);
925         unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
926         unregister_trace_block_getrq(blk_add_trace_getrq);
927         unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
928         unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
929         unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
930         unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
931         unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
932         unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
933         unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
934         unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
935         unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
936         unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
937
938         tracepoint_synchronize_unregister();
939 }
940
/*
 * struct blk_io_trace formatting routines
 */
944
945 static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
946 {
947         int i = 0;
948         int tc = t->action >> BLK_TC_SHIFT;
949
950         if (t->action == BLK_TN_MESSAGE) {
951                 rwbs[i++] = 'N';
952                 goto out;
953         }
954
955         if (tc & BLK_TC_DISCARD)
956                 rwbs[i++] = 'D';
957         else if (tc & BLK_TC_WRITE)
958                 rwbs[i++] = 'W';
959         else if (t->bytes)
960                 rwbs[i++] = 'R';
961         else
962                 rwbs[i++] = 'N';
963
964         if (tc & BLK_TC_AHEAD)
965                 rwbs[i++] = 'A';
966         if (tc & BLK_TC_BARRIER)
967                 rwbs[i++] = 'B';
968         if (tc & BLK_TC_SYNC)
969                 rwbs[i++] = 'S';
970         if (tc & BLK_TC_META)
971                 rwbs[i++] = 'M';
972 out:
973         rwbs[i] = '\0';
974 }
975
/* View a generic trace entry as the blk_io_trace record it is cast to. */
static inline
const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
{
        const struct blk_io_trace *t = (const struct blk_io_trace *)ent;

        return t;
}
981
982 static inline const void *pdu_start(const struct trace_entry *ent)
983 {
984         return te_blk_io_trace(ent) + 1;
985 }
986
987 static inline u32 t_action(const struct trace_entry *ent)
988 {
989         return te_blk_io_trace(ent)->action;
990 }
991
992 static inline u32 t_bytes(const struct trace_entry *ent)
993 {
994         return te_blk_io_trace(ent)->bytes;
995 }
996
997 static inline u32 t_sec(const struct trace_entry *ent)
998 {
999         return te_blk_io_trace(ent)->bytes >> 9;
1000 }
1001
1002 static inline unsigned long long t_sector(const struct trace_entry *ent)
1003 {
1004         return te_blk_io_trace(ent)->sector;
1005 }
1006
1007 static inline __u16 t_error(const struct trace_entry *ent)
1008 {
1009         return te_blk_io_trace(ent)->error;
1010 }
1011
1012 static __u64 get_pdu_int(const struct trace_entry *ent)
1013 {
1014         const __u64 *val = pdu_start(ent);
1015         return be64_to_cpu(*val);
1016 }
1017
1018 static void get_pdu_remap(const struct trace_entry *ent,
1019                           struct blk_io_trace_remap *r)
1020 {
1021         const struct blk_io_trace_remap *__r = pdu_start(ent);
1022         __u64 sector = __r->sector;
1023
1024         r->device = be32_to_cpu(__r->device);
1025         r->device_from = be32_to_cpu(__r->device_from);
1026         r->sector = be64_to_cpu(sector);
1027 }
1028
1029 typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1030
1031 static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1032 {
1033         char rwbs[6];
1034         unsigned long long ts  = iter->ts;
1035         unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
1036         unsigned secs          = (unsigned long)ts;
1037         const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1038
1039         fill_rwbs(rwbs, t);
1040
1041         return trace_seq_printf(&iter->seq,
1042                                 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
1043                                 MAJOR(t->device), MINOR(t->device), iter->cpu,
1044                                 secs, nsec_rem, iter->ent->pid, act, rwbs);
1045 }
1046
1047 static int blk_log_action(struct trace_iterator *iter, const char *act)
1048 {
1049         char rwbs[6];
1050         const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1051
1052         fill_rwbs(rwbs, t);
1053         return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
1054                                 MAJOR(t->device), MINOR(t->device), act, rwbs);
1055 }
1056
1057 static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1058 {
1059         const char *pdu_buf;
1060         int pdu_len;
1061         int i, end, ret;
1062
1063         pdu_buf = pdu_start(ent);
1064         pdu_len = te_blk_io_trace(ent)->pdu_len;
1065
1066         if (!pdu_len)
1067                 return 1;
1068
1069         /* find the last zero that needs to be printed */
1070         for (end = pdu_len - 1; end >= 0; end--)
1071                 if (pdu_buf[end])
1072                         break;
1073         end++;
1074
1075         if (!trace_seq_putc(s, '('))
1076                 return 0;
1077
1078         for (i = 0; i < pdu_len; i++) {
1079
1080                 ret = trace_seq_printf(s, "%s%02x",
1081                                        i == 0 ? "" : " ", pdu_buf[i]);
1082                 if (!ret)
1083                         return ret;
1084
1085                 /*
1086                  * stop when the rest is just zeroes and indicate so
1087                  * with a ".." appended
1088                  */
1089                 if (i == end && end != pdu_len - 1)
1090                         return trace_seq_puts(s, " ..) ");
1091         }
1092
1093         return trace_seq_puts(s, ") ");
1094 }
1095
1096 static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1097 {
1098         char cmd[TASK_COMM_LEN];
1099
1100         trace_find_cmdline(ent->pid, cmd);
1101
1102         if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1103                 int ret;
1104
1105                 ret = trace_seq_printf(s, "%u ", t_bytes(ent));
1106                 if (!ret)
1107                         return 0;
1108                 ret = blk_log_dump_pdu(s, ent);
1109                 if (!ret)
1110                         return 0;
1111                 return trace_seq_printf(s, "[%s]\n", cmd);
1112         } else {
1113                 if (t_sec(ent))
1114                         return trace_seq_printf(s, "%llu + %u [%s]\n",
1115                                                 t_sector(ent), t_sec(ent), cmd);
1116                 return trace_seq_printf(s, "[%s]\n", cmd);
1117         }
1118 }
1119
1120 static int blk_log_with_error(struct trace_seq *s,
1121                               const struct trace_entry *ent)
1122 {
1123         if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1124                 int ret;
1125
1126                 ret = blk_log_dump_pdu(s, ent);
1127                 if (ret)
1128                         return trace_seq_printf(s, "[%d]\n", t_error(ent));
1129                 return 0;
1130         } else {
1131                 if (t_sec(ent))
1132                         return trace_seq_printf(s, "%llu + %u [%d]\n",
1133                                                 t_sector(ent),
1134                                                 t_sec(ent), t_error(ent));
1135                 return trace_seq_printf(s, "%llu [%d]\n",
1136                                         t_sector(ent), t_error(ent));
1137         }
1138 }
1139
1140 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1141 {
1142         struct blk_io_trace_remap r = { .device = 0, };
1143
1144         get_pdu_remap(ent, &r);
1145         return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1146                                t_sector(ent),
1147                                t_sec(ent), MAJOR(r.device), MINOR(r.device),
1148                                (unsigned long long)r.sector);
1149 }
1150
1151 static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
1152 {
1153         char cmd[TASK_COMM_LEN];
1154
1155         trace_find_cmdline(ent->pid, cmd);
1156
1157         return trace_seq_printf(s, "[%s]\n", cmd);
1158 }
1159
1160 static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
1161 {
1162         char cmd[TASK_COMM_LEN];
1163
1164         trace_find_cmdline(ent->pid, cmd);
1165
1166         return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
1167 }
1168
1169 static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
1170 {
1171         char cmd[TASK_COMM_LEN];
1172
1173         trace_find_cmdline(ent->pid, cmd);
1174
1175         return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1176                                 get_pdu_int(ent), cmd);
1177 }
1178
1179 static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
1180 {
1181         int ret;
1182         const struct blk_io_trace *t = te_blk_io_trace(ent);
1183
1184         ret = trace_seq_putmem(s, t + 1, t->pdu_len);
1185         if (ret)
1186                 return trace_seq_putc(s, '\n');
1187         return ret;
1188 }
1189
1190 /*
1191  * struct tracer operations
1192  */
1193
1194 static void blk_tracer_print_header(struct seq_file *m)
1195 {
1196         if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1197                 return;
1198         seq_puts(m, "# DEV   CPU TIMESTAMP     PID ACT FLG\n"
1199                     "#  |     |     |           |   |   |\n");
1200 }
1201
1202 static void blk_tracer_start(struct trace_array *tr)
1203 {
1204         blk_tracer_enabled = true;
1205         trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1206 }
1207
1208 static int blk_tracer_init(struct trace_array *tr)
1209 {
1210         blk_tr = tr;
1211         blk_tracer_start(tr);
1212         return 0;
1213 }
1214
1215 static void blk_tracer_stop(struct trace_array *tr)
1216 {
1217         blk_tracer_enabled = false;
1218         trace_flags |= TRACE_ITER_CONTEXT_INFO;
1219 }
1220
/* Tracer reset hook: identical to stopping the tracer. */
static void blk_tracer_reset(struct trace_array *tr)
{
        blk_tracer_stop(tr);
}
1225
1226 static const struct {
1227         const char *act[2];
1228         int        (*print)(struct trace_seq *s, const struct trace_entry *ent);
1229 } what2act[] = {
1230         [__BLK_TA_QUEUE]        = {{  "Q", "queue" },      blk_log_generic },
1231         [__BLK_TA_BACKMERGE]    = {{  "M", "backmerge" },  blk_log_generic },
1232         [__BLK_TA_FRONTMERGE]   = {{  "F", "frontmerge" }, blk_log_generic },
1233         [__BLK_TA_GETRQ]        = {{  "G", "getrq" },      blk_log_generic },
1234         [__BLK_TA_SLEEPRQ]      = {{  "S", "sleeprq" },    blk_log_generic },
1235         [__BLK_TA_REQUEUE]      = {{  "R", "requeue" },    blk_log_with_error },
1236         [__BLK_TA_ISSUE]        = {{  "D", "issue" },      blk_log_generic },
1237         [__BLK_TA_COMPLETE]     = {{  "C", "complete" },   blk_log_with_error },
1238         [__BLK_TA_PLUG]         = {{  "P", "plug" },       blk_log_plug },
1239         [__BLK_TA_UNPLUG_IO]    = {{  "U", "unplug_io" },  blk_log_unplug },
1240         [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
1241         [__BLK_TA_INSERT]       = {{  "I", "insert" },     blk_log_generic },
1242         [__BLK_TA_SPLIT]        = {{  "X", "split" },      blk_log_split },
1243         [__BLK_TA_BOUNCE]       = {{  "B", "bounce" },     blk_log_generic },
1244         [__BLK_TA_REMAP]        = {{  "A", "remap" },      blk_log_remap },
1245 };
1246
1247 static enum print_line_t print_one_line(struct trace_iterator *iter,
1248                                         bool classic)
1249 {
1250         struct trace_seq *s = &iter->seq;
1251         const struct blk_io_trace *t;
1252         u16 what;
1253         int ret;
1254         bool long_act;
1255         blk_log_action_t *log_action;
1256
1257         t          = te_blk_io_trace(iter->ent);
1258         what       = t->action & ((1 << BLK_TC_SHIFT) - 1);
1259         long_act   = !!(trace_flags & TRACE_ITER_VERBOSE);
1260         log_action = classic ? &blk_log_action_classic : &blk_log_action;
1261
1262         if (t->action == BLK_TN_MESSAGE) {
1263                 ret = log_action(iter, long_act ? "message" : "m");
1264                 if (ret)
1265                         ret = blk_log_msg(s, iter->ent);
1266                 goto out;
1267         }
1268
1269         if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1270                 ret = trace_seq_printf(s, "Unknown action %x\n", what);
1271         else {
1272                 ret = log_action(iter, what2act[what].act[long_act]);
1273                 if (ret)
1274                         ret = what2act[what].print(s, iter->ent);
1275         }
1276 out:
1277         return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1278 }
1279
1280 static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1281                                                int flags)
1282 {
1283         if (!trace_print_context(iter))
1284                 return TRACE_TYPE_PARTIAL_LINE;
1285
1286         return print_one_line(iter, false);
1287 }
1288
1289 static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1290 {
1291         struct trace_seq *s = &iter->seq;
1292         struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
1293         const int offset = offsetof(struct blk_io_trace, sector);
1294         struct blk_io_trace old = {
1295                 .magic    = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
1296                 .time     = iter->ts,
1297         };
1298
1299         if (!trace_seq_putmem(s, &old, offset))
1300                 return 0;
1301         return trace_seq_putmem(s, &t->sector,
1302                                 sizeof(old) - offset + t->pdu_len);
1303 }
1304
1305 static enum print_line_t
1306 blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
1307 {
1308         return blk_trace_synthesize_old_trace(iter) ?
1309                         TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1310 }
1311
1312 static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1313 {
1314         if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1315                 return TRACE_TYPE_UNHANDLED;
1316
1317         return print_one_line(iter, true);
1318 }
1319
1320 static struct tracer blk_tracer __read_mostly = {
1321         .name           = "blk",
1322         .init           = blk_tracer_init,
1323         .reset          = blk_tracer_reset,
1324         .start          = blk_tracer_start,
1325         .stop           = blk_tracer_stop,
1326         .print_header   = blk_tracer_print_header,
1327         .print_line     = blk_tracer_print_line,
1328         .flags          = &blk_tracer_flags,
1329 };
1330
1331 static struct trace_event trace_blk_event = {
1332         .type           = TRACE_BLK,
1333         .trace          = blk_trace_event_print,
1334         .binary         = blk_trace_event_print_binary,
1335 };
1336
1337 static int __init init_blk_tracer(void)
1338 {
1339         if (!register_ftrace_event(&trace_blk_event)) {
1340                 pr_warning("Warning: could not register block events\n");
1341                 return 1;
1342         }
1343
1344         if (register_tracer(&blk_tracer) != 0) {
1345                 pr_warning("Warning: could not register the block tracer\n");
1346                 unregister_ftrace_event(&trace_blk_event);
1347                 return 1;
1348         }
1349
1350         return 0;
1351 }
1352
1353 device_initcall(init_blk_tracer);
1354
1355 static int blk_trace_remove_queue(struct request_queue *q)
1356 {
1357         struct blk_trace *bt;
1358
1359         bt = xchg(&q->blk_trace, NULL);
1360         if (bt == NULL)
1361                 return -EINVAL;
1362
1363         if (atomic_dec_and_test(&blk_probes_ref))
1364                 blk_unregister_tracepoints();
1365
1366         blk_trace_free(bt);
1367         return 0;
1368 }
1369
1370 /*
1371  * Setup everything required to start tracing
1372  */
1373 static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
1374 {
1375         struct blk_trace *old_bt, *bt = NULL;
1376         int ret = -ENOMEM;
1377
1378         bt = kzalloc(sizeof(*bt), GFP_KERNEL);
1379         if (!bt)
1380                 return -ENOMEM;
1381
1382         bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
1383         if (!bt->msg_data)
1384                 goto free_bt;
1385
1386         bt->dev = dev;
1387         bt->act_mask = (u16)-1;
1388         bt->end_lba = -1ULL;
1389
1390         old_bt = xchg(&q->blk_trace, bt);
1391         if (old_bt != NULL) {
1392                 (void)xchg(&q->blk_trace, old_bt);
1393                 ret = -EBUSY;
1394                 goto free_bt;
1395         }
1396
1397         if (atomic_inc_return(&blk_probes_ref) == 1)
1398                 blk_register_tracepoints();
1399         return 0;
1400
1401 free_bt:
1402         blk_trace_free(bt);
1403         return ret;
1404 }
1405
1406 /*
1407  * sysfs interface to enable and configure tracing
1408  */
1409
1410 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1411                                          struct device_attribute *attr,
1412                                          char *buf);
1413 static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1414                                           struct device_attribute *attr,
1415                                           const char *buf, size_t count);
1416 #define BLK_TRACE_DEVICE_ATTR(_name) \
1417         DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
1418                     sysfs_blk_trace_attr_show, \
1419                     sysfs_blk_trace_attr_store)
1420
1421 static BLK_TRACE_DEVICE_ATTR(enable);
1422 static BLK_TRACE_DEVICE_ATTR(act_mask);
1423 static BLK_TRACE_DEVICE_ATTR(pid);
1424 static BLK_TRACE_DEVICE_ATTR(start_lba);
1425 static BLK_TRACE_DEVICE_ATTR(end_lba);
1426
1427 static struct attribute *blk_trace_attrs[] = {
1428         &dev_attr_enable.attr,
1429         &dev_attr_act_mask.attr,
1430         &dev_attr_pid.attr,
1431         &dev_attr_start_lba.attr,
1432         &dev_attr_end_lba.attr,
1433         NULL
1434 };
1435
1436 struct attribute_group blk_trace_attr_group = {
1437         .name  = "trace",
1438         .attrs = blk_trace_attrs,
1439 };
1440
1441 static const struct {
1442         int mask;
1443         const char *str;
1444 } mask_maps[] = {
1445         { BLK_TC_READ,          "read"          },
1446         { BLK_TC_WRITE,         "write"         },
1447         { BLK_TC_BARRIER,       "barrier"       },
1448         { BLK_TC_SYNC,          "sync"          },
1449         { BLK_TC_QUEUE,         "queue"         },
1450         { BLK_TC_REQUEUE,       "requeue"       },
1451         { BLK_TC_ISSUE,         "issue"         },
1452         { BLK_TC_COMPLETE,      "complete"      },
1453         { BLK_TC_FS,            "fs"            },
1454         { BLK_TC_PC,            "pc"            },
1455         { BLK_TC_AHEAD,         "ahead"         },
1456         { BLK_TC_META,          "meta"          },
1457         { BLK_TC_DISCARD,       "discard"       },
1458         { BLK_TC_DRV_DATA,      "drv_data"      },
1459 };
1460
1461 static int blk_trace_str2mask(const char *str)
1462 {
1463         int i;
1464         int mask = 0;
1465         char *buf, *s, *token;
1466
1467         buf = kstrdup(str, GFP_KERNEL);
1468         if (buf == NULL)
1469                 return -ENOMEM;
1470         s = strstrip(buf);
1471
1472         while (1) {
1473                 token = strsep(&s, ",");
1474                 if (token == NULL)
1475                         break;
1476
1477                 if (*token == '\0')
1478                         continue;
1479
1480                 for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
1481                         if (strcasecmp(token, mask_maps[i].str) == 0) {
1482                                 mask |= mask_maps[i].mask;
1483                                 break;
1484                         }
1485                 }
1486                 if (i == ARRAY_SIZE(mask_maps)) {
1487                         mask = -EINVAL;
1488                         break;
1489                 }
1490         }
1491         kfree(buf);
1492
1493         return mask;
1494 }
1495
1496 static ssize_t blk_trace_mask2str(char *buf, int mask)
1497 {
1498         int i;
1499         char *p = buf;
1500
1501         for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
1502                 if (mask & mask_maps[i].mask) {
1503                         p += sprintf(p, "%s%s",
1504                                     (p == buf) ? "" : ",", mask_maps[i].str);
1505                 }
1506         }
1507         *p++ = '\n';
1508
1509         return p - buf;
1510 }
1511
1512 static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
1513 {
1514         if (bdev->bd_disk == NULL)
1515                 return NULL;
1516
1517         return bdev_get_queue(bdev);
1518 }
1519
1520 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1521                                          struct device_attribute *attr,
1522                                          char *buf)
1523 {
1524         struct hd_struct *p = dev_to_part(dev);
1525         struct request_queue *q;
1526         struct block_device *bdev;
1527         ssize_t ret = -ENXIO;
1528
1529         lock_kernel();
1530         bdev = bdget(part_devt(p));
1531         if (bdev == NULL)
1532                 goto out_unlock_kernel;
1533
1534         q = blk_trace_get_queue(bdev);
1535         if (q == NULL)
1536                 goto out_bdput;
1537
1538         mutex_lock(&bdev->bd_mutex);
1539
1540         if (attr == &dev_attr_enable) {
1541                 ret = sprintf(buf, "%u\n", !!q->blk_trace);
1542                 goto out_unlock_bdev;
1543         }
1544
1545         if (q->blk_trace == NULL)
1546                 ret = sprintf(buf, "disabled\n");
1547         else if (attr == &dev_attr_act_mask)
1548                 ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
1549         else if (attr == &dev_attr_pid)
1550                 ret = sprintf(buf, "%u\n", q->blk_trace->pid);
1551         else if (attr == &dev_attr_start_lba)
1552                 ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
1553         else if (attr == &dev_attr_end_lba)
1554                 ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
1555
1556 out_unlock_bdev:
1557         mutex_unlock(&bdev->bd_mutex);
1558 out_bdput:
1559         bdput(bdev);
1560 out_unlock_kernel:
1561         unlock_kernel();
1562         return ret;
1563 }
1564
1565 static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1566                                           struct device_attribute *attr,
1567                                           const char *buf, size_t count)
1568 {
1569         struct block_device *bdev;
1570         struct request_queue *q;
1571         struct hd_struct *p;
1572         u64 value;
1573         ssize_t ret = -EINVAL;
1574
1575         if (count == 0)
1576                 goto out;
1577
1578         if (attr == &dev_attr_act_mask) {
1579                 if (sscanf(buf, "%llx", &value) != 1) {
1580                         /* Assume it is a list of trace category names */
1581                         ret = blk_trace_str2mask(buf);
1582                         if (ret < 0)
1583                                 goto out;
1584                         value = ret;
1585                 }
1586         } else if (sscanf(buf, "%llu", &value) != 1)
1587                 goto out;
1588
1589         ret = -ENXIO;
1590
1591         lock_kernel();
1592         p = dev_to_part(dev);
1593         bdev = bdget(part_devt(p));
1594         if (bdev == NULL)
1595                 goto out_unlock_kernel;
1596
1597         q = blk_trace_get_queue(bdev);
1598         if (q == NULL)
1599                 goto out_bdput;
1600
1601         mutex_lock(&bdev->bd_mutex);
1602
1603         if (attr == &dev_attr_enable) {
1604                 if (value)
1605                         ret = blk_trace_setup_queue(q, bdev->bd_dev);
1606                 else
1607                         ret = blk_trace_remove_queue(q);
1608                 goto out_unlock_bdev;
1609         }
1610
1611         ret = 0;
1612         if (q->blk_trace == NULL)
1613                 ret = blk_trace_setup_queue(q, bdev->bd_dev);
1614
1615         if (ret == 0) {
1616                 if (attr == &dev_attr_act_mask)
1617                         q->blk_trace->act_mask = value;
1618                 else if (attr == &dev_attr_pid)
1619                         q->blk_trace->pid = value;
1620                 else if (attr == &dev_attr_start_lba)
1621                         q->blk_trace->start_lba = value;
1622                 else if (attr == &dev_attr_end_lba)
1623                         q->blk_trace->end_lba = value;
1624         }
1625
1626 out_unlock_bdev:
1627         mutex_unlock(&bdev->bd_mutex);
1628 out_bdput:
1629         bdput(bdev);
1630 out_unlock_kernel:
1631         unlock_kernel();
1632 out:
1633         return ret ? ret : count;
1634 }
1635