718b8f7b6aac005e34e912fea880968d395d899d
[linux-2.6.git] / tools / perf / builtin-record.c
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #include "builtin.h"
9
10 #include "perf.h"
11
12 #include "util/util.h"
13 #include "util/parse-options.h"
14 #include "util/parse-events.h"
15 #include "util/string.h"
16
17 #include "util/header.h"
18
19 #include <unistd.h>
20 #include <sched.h>
21
/*
 * Round x up to the next multiple of a.  The rounding is done by adding
 * (a - 1) and masking the low bits off, so a must be a power of two.
 *
 * __typeof__ is used rather than typeof so that the macro also builds in
 * strict ISO C modes (-std=c99, -std=c11), where the plain typeof keyword
 * is not available.
 */
#define __ALIGN_MASK(x, mask)   (((x)+(mask))&~(mask))
#define ALIGN(x, a)             __ALIGN_MASK(x, (__typeof__(x))(a)-1)
24
25 static int                      fd[MAX_NR_CPUS][MAX_COUNTERS];
26
27 static long                     default_interval                = 100000;
28
29 static int                      nr_cpus                         = 0;
30 static unsigned int             page_size;
31 static unsigned int             mmap_pages                      = 128;
32 static int                      freq                            = 0;
33 static int                      output;
34 static const char               *output_name                    = "perf.data";
35 static int                      group                           = 0;
36 static unsigned int             realtime_prio                   = 0;
37 static int                      system_wide                     = 0;
38 static pid_t                    target_pid                      = -1;
39 static int                      inherit                         = 1;
40 static int                      force                           = 0;
41 static int                      append_file                     = 0;
42 static int                      call_graph                      = 0;
43 static int                      inherit_stat                    = 0;
44 static int                      no_samples                      = 0;
45 static int                      sample_address                  = 0;
46
47 static long                     samples;
48 static struct timeval           last_read;
49 static struct timeval           this_read;
50
51 static u64                      bytes_written;
52
53 static struct pollfd            event_array[MAX_NR_CPUS * MAX_COUNTERS];
54
55 static int                      nr_poll;
56 static int                      nr_cpu;
57
58 static int                      file_new = 1;
59
60 struct perf_header              *header;
61
62 struct mmap_data {
63         int                     counter;
64         void                    *base;
65         unsigned int            mask;
66         unsigned int            prev;
67 };
68
69 static struct mmap_data         mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
70
71 static unsigned long mmap_read_head(struct mmap_data *md)
72 {
73         struct perf_counter_mmap_page *pc = md->base;
74         long head;
75
76         head = pc->data_head;
77         rmb();
78
79         return head;
80 }
81
82 static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
83 {
84         struct perf_counter_mmap_page *pc = md->base;
85
86         /*
87          * ensure all reads are done before we write the tail out.
88          */
89         /* mb(); */
90         pc->data_tail = tail;
91 }
92
93 static void write_output(void *buf, size_t size)
94 {
95         while (size) {
96                 int ret = write(output, buf, size);
97
98                 if (ret < 0)
99                         die("failed to write");
100
101                 size -= ret;
102                 buf += ret;
103
104                 bytes_written += ret;
105         }
106 }
107
108 static void mmap_read(struct mmap_data *md)
109 {
110         unsigned int head = mmap_read_head(md);
111         unsigned int old = md->prev;
112         unsigned char *data = md->base + page_size;
113         unsigned long size;
114         void *buf;
115         int diff;
116
117         gettimeofday(&this_read, NULL);
118
119         /*
120          * If we're further behind than half the buffer, there's a chance
121          * the writer will bite our tail and mess up the samples under us.
122          *
123          * If we somehow ended up ahead of the head, we got messed up.
124          *
125          * In either case, truncate and restart at head.
126          */
127         diff = head - old;
128         if (diff < 0) {
129                 struct timeval iv;
130                 unsigned long msecs;
131
132                 timersub(&this_read, &last_read, &iv);
133                 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
134
135                 fprintf(stderr, "WARNING: failed to keep up with mmap data."
136                                 "  Last read %lu msecs ago.\n", msecs);
137
138                 /*
139                  * head points to a known good entry, start there.
140                  */
141                 old = head;
142         }
143
144         last_read = this_read;
145
146         if (old != head)
147                 samples++;
148
149         size = head - old;
150
151         if ((old & md->mask) + size != (head & md->mask)) {
152                 buf = &data[old & md->mask];
153                 size = md->mask + 1 - (old & md->mask);
154                 old += size;
155
156                 write_output(buf, size);
157         }
158
159         buf = &data[old & md->mask];
160         size = head - old;
161         old += size;
162
163         write_output(buf, size);
164
165         md->prev = old;
166         mmap_write_tail(md, old);
167 }
168
/*
 * Flags written from the signal handler and read by the rest of the
 * program.  volatile sig_atomic_t is the type the C standard guarantees
 * can be safely written from a handler.
 *
 * done:  set to 1 once a handled signal has arrived
 * signr: number of the last handled signal, -1 while none was received
 */
static volatile sig_atomic_t done = 0;
static volatile sig_atomic_t signr = -1;

/*
 * Signal handler: only records that (and which) signal arrived.
 */
static void sig_handler(int sig)
{
	done = 1;
	signr = sig;
}
177
178 static void sig_atexit(void)
179 {
180         if (signr == -1)
181                 return;
182
183         signal(signr, SIG_DFL);
184         kill(getpid(), signr);
185 }
186
187 static void pid_synthesize_comm_event(pid_t pid, int full)
188 {
189         struct comm_event comm_ev;
190         char filename[PATH_MAX];
191         char bf[BUFSIZ];
192         int fd;
193         size_t size;
194         char *field, *sep;
195         DIR *tasks;
196         struct dirent dirent, *next;
197
198         snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
199
200         fd = open(filename, O_RDONLY);
201         if (fd < 0) {
202                 /*
203                  * We raced with a task exiting - just return:
204                  */
205                 if (verbose)
206                         fprintf(stderr, "couldn't open %s\n", filename);
207                 return;
208         }
209         if (read(fd, bf, sizeof(bf)) < 0) {
210                 fprintf(stderr, "couldn't read %s\n", filename);
211                 exit(EXIT_FAILURE);
212         }
213         close(fd);
214
215         /* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
216         memset(&comm_ev, 0, sizeof(comm_ev));
217         field = strchr(bf, '(');
218         if (field == NULL)
219                 goto out_failure;
220         sep = strchr(++field, ')');
221         if (sep == NULL)
222                 goto out_failure;
223         size = sep - field;
224         memcpy(comm_ev.comm, field, size++);
225
226         comm_ev.pid = pid;
227         comm_ev.header.type = PERF_EVENT_COMM;
228         size = ALIGN(size, sizeof(u64));
229         comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
230
231         if (!full) {
232                 comm_ev.tid = pid;
233
234                 write_output(&comm_ev, comm_ev.header.size);
235                 return;
236         }
237
238         snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
239
240         tasks = opendir(filename);
241         while (!readdir_r(tasks, &dirent, &next) && next) {
242                 char *end;
243                 pid = strtol(dirent.d_name, &end, 10);
244                 if (*end)
245                         continue;
246
247                 comm_ev.tid = pid;
248
249                 write_output(&comm_ev, comm_ev.header.size);
250         }
251         closedir(tasks);
252         return;
253
254 out_failure:
255         fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
256                 filename);
257         exit(EXIT_FAILURE);
258 }
259
260 static void pid_synthesize_mmap_samples(pid_t pid)
261 {
262         char filename[PATH_MAX];
263         FILE *fp;
264
265         snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);
266
267         fp = fopen(filename, "r");
268         if (fp == NULL) {
269                 /*
270                  * We raced with a task exiting - just return:
271                  */
272                 if (verbose)
273                         fprintf(stderr, "couldn't open %s\n", filename);
274                 return;
275         }
276         while (1) {
277                 char bf[BUFSIZ], *pbf = bf;
278                 struct mmap_event mmap_ev = {
279                         .header = { .type = PERF_EVENT_MMAP },
280                 };
281                 int n;
282                 size_t size;
283                 if (fgets(bf, sizeof(bf), fp) == NULL)
284                         break;
285
286                 /* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
287                 n = hex2u64(pbf, &mmap_ev.start);
288                 if (n < 0)
289                         continue;
290                 pbf += n + 1;
291                 n = hex2u64(pbf, &mmap_ev.len);
292                 if (n < 0)
293                         continue;
294                 pbf += n + 3;
295                 if (*pbf == 'x') { /* vm_exec */
296                         char *execname = strchr(bf, '/');
297
298                         /* Catch VDSO */
299                         if (execname == NULL)
300                                 execname = strstr(bf, "[vdso]");
301
302                         if (execname == NULL)
303                                 continue;
304
305                         size = strlen(execname);
306                         execname[size - 1] = '\0'; /* Remove \n */
307                         memcpy(mmap_ev.filename, execname, size);
308                         size = ALIGN(size, sizeof(u64));
309                         mmap_ev.len -= mmap_ev.start;
310                         mmap_ev.header.size = (sizeof(mmap_ev) -
311                                                (sizeof(mmap_ev.filename) - size));
312                         mmap_ev.pid = pid;
313                         mmap_ev.tid = pid;
314
315                         write_output(&mmap_ev, mmap_ev.header.size);
316                 }
317         }
318
319         fclose(fp);
320 }
321
/*
 * Walk /proc and, for every purely numeric entry (a pid), synthesize the
 * comm events of all its threads and the mmap events of its executable
 * mappings.
 *
 * Dies if /proc cannot be opened.
 */
static void synthesize_all(void)
{
	DIR *proc;
	struct dirent dirent, *next;

	proc = opendir("/proc");
	/* a NULL stream must not reach readdir_r() */
	if (proc == NULL)
		die("failed to open /proc");

	while (!readdir_r(proc, &dirent, &next) && next) {
		char *end;
		pid_t pid;

		pid = strtol(dirent.d_name, &end, 10);
		if (*end) /* only interested in proper numerical dirents */
			continue;

		pid_synthesize_comm_event(pid, 1);
		pid_synthesize_mmap_samples(pid);
	}

	closedir(proc);
}
343
344 static int group_fd;
345
346 static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int nr)
347 {
348         struct perf_header_attr *h_attr;
349
350         if (nr < header->attrs) {
351                 h_attr = header->attr[nr];
352         } else {
353                 h_attr = perf_header_attr__new(a);
354                 perf_header__add_attr(header, h_attr);
355         }
356
357         return h_attr;
358 }
359
360 static void create_counter(int counter, int cpu, pid_t pid)
361 {
362         struct perf_counter_attr *attr = attrs + counter;
363         struct perf_header_attr *h_attr;
364         int track = !counter; /* only the first counter needs these */
365         struct {
366                 u64 count;
367                 u64 time_enabled;
368                 u64 time_running;
369                 u64 id;
370         } read_data;
371
372         attr->read_format       = PERF_FORMAT_TOTAL_TIME_ENABLED |
373                                   PERF_FORMAT_TOTAL_TIME_RUNNING |
374                                   PERF_FORMAT_ID;
375
376         attr->sample_type       = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
377
378         if (freq) {
379                 attr->sample_type       |= PERF_SAMPLE_PERIOD;
380                 attr->freq              = 1;
381                 attr->sample_freq       = freq;
382         }
383
384         if (no_samples)
385                 attr->sample_freq = 0;
386
387         if (inherit_stat)
388                 attr->inherit_stat = 1;
389
390         if (sample_address)
391                 attr->sample_type       |= PERF_SAMPLE_ADDR;
392
393         if (call_graph)
394                 attr->sample_type       |= PERF_SAMPLE_CALLCHAIN;
395
396
397         attr->mmap              = track;
398         attr->comm              = track;
399         attr->inherit           = (cpu < 0) && inherit;
400         attr->disabled          = 1;
401
402 try_again:
403         fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
404
405         if (fd[nr_cpu][counter] < 0) {
406                 int err = errno;
407
408                 if (err == EPERM)
409                         die("Permission error - are you root?\n");
410
411                 /*
412                  * If it's cycles then fall back to hrtimer
413                  * based cpu-clock-tick sw counter, which
414                  * is always available even if no PMU support:
415                  */
416                 if (attr->type == PERF_TYPE_HARDWARE
417                         && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
418
419                         if (verbose)
420                                 warning(" ... trying to fall back to cpu-clock-ticks\n");
421                         attr->type = PERF_TYPE_SOFTWARE;
422                         attr->config = PERF_COUNT_SW_CPU_CLOCK;
423                         goto try_again;
424                 }
425                 printf("\n");
426                 error("perfcounter syscall returned with %d (%s)\n",
427                         fd[nr_cpu][counter], strerror(err));
428                 die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
429                 exit(-1);
430         }
431
432         h_attr = get_header_attr(attr, counter);
433
434         if (!file_new) {
435                 if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
436                         fprintf(stderr, "incompatible append\n");
437                         exit(-1);
438                 }
439         }
440
441         if (read(fd[nr_cpu][counter], &read_data, sizeof(read_data)) == -1) {
442                 perror("Unable to read perf file descriptor\n");
443                 exit(-1);
444         }
445
446         perf_header_attr__add_id(h_attr, read_data.id);
447
448         assert(fd[nr_cpu][counter] >= 0);
449         fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
450
451         /*
452          * First counter acts as the group leader:
453          */
454         if (group && group_fd == -1)
455                 group_fd = fd[nr_cpu][counter];
456
457         event_array[nr_poll].fd = fd[nr_cpu][counter];
458         event_array[nr_poll].events = POLLIN;
459         nr_poll++;
460
461         mmap_array[nr_cpu][counter].counter = counter;
462         mmap_array[nr_cpu][counter].prev = 0;
463         mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
464         mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
465                         PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
466         if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
467                 error("failed to mmap with %d (%s)\n", errno, strerror(errno));
468                 exit(-1);
469         }
470
471         ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
472 }
473
474 static void open_counters(int cpu, pid_t pid)
475 {
476         int counter;
477
478         group_fd = -1;
479         for (counter = 0; counter < nr_counters; counter++)
480                 create_counter(counter, cpu, pid);
481
482         nr_cpu++;
483 }
484
485 static void atexit_header(void)
486 {
487         header->data_size += bytes_written;
488
489         perf_header__write(header, output);
490 }
491
492 static int __cmd_record(int argc, const char **argv)
493 {
494         int i, counter;
495         struct stat st;
496         pid_t pid = 0;
497         int flags;
498         int ret;
499
500         page_size = sysconf(_SC_PAGE_SIZE);
501         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
502         assert(nr_cpus <= MAX_NR_CPUS);
503         assert(nr_cpus >= 0);
504
505         atexit(sig_atexit);
506         signal(SIGCHLD, sig_handler);
507         signal(SIGINT, sig_handler);
508
509         if (!stat(output_name, &st) && st.st_size) {
510                 if (!force && !append_file) {
511                         fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
512                                         output_name);
513                         exit(-1);
514                 }
515         } else {
516                 append_file = 0;
517         }
518
519         flags = O_CREAT|O_RDWR;
520         if (append_file)
521                 file_new = 0;
522         else
523                 flags |= O_TRUNC;
524
525         output = open(output_name, flags, S_IRUSR|S_IWUSR);
526         if (output < 0) {
527                 perror("failed to create output file");
528                 exit(-1);
529         }
530
531         if (!file_new)
532                 header = perf_header__read(output);
533         else
534                 header = perf_header__new();
535
536         atexit(atexit_header);
537
538         if (!system_wide) {
539                 pid = target_pid;
540                 if (pid == -1)
541                         pid = getpid();
542
543                 open_counters(-1, pid);
544         } else for (i = 0; i < nr_cpus; i++)
545                 open_counters(i, target_pid);
546
547         if (file_new)
548                 perf_header__write(header, output);
549
550         if (!system_wide) {
551                 pid_synthesize_comm_event(pid, 0);
552                 pid_synthesize_mmap_samples(pid);
553         } else
554                 synthesize_all();
555
556         if (target_pid == -1 && argc) {
557                 pid = fork();
558                 if (pid < 0)
559                         perror("failed to fork");
560
561                 if (!pid) {
562                         if (execvp(argv[0], (char **)argv)) {
563                                 perror(argv[0]);
564                                 exit(-1);
565                         }
566                 }
567         }
568
569         if (realtime_prio) {
570                 struct sched_param param;
571
572                 param.sched_priority = realtime_prio;
573                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
574                         printf("Could not set realtime priority.\n");
575                         exit(-1);
576                 }
577         }
578
579         for (;;) {
580                 int hits = samples;
581
582                 for (i = 0; i < nr_cpu; i++) {
583                         for (counter = 0; counter < nr_counters; counter++)
584                                 mmap_read(&mmap_array[i][counter]);
585                 }
586
587                 if (hits == samples) {
588                         if (done)
589                                 break;
590                         ret = poll(event_array, nr_poll, 100);
591                 }
592         }
593
594         /*
595          * Approximate RIP event size: 24 bytes.
596          */
597         fprintf(stderr,
598                 "[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
599                 (double)bytes_written / 1024.0 / 1024.0,
600                 output_name,
601                 bytes_written / 24);
602
603         return 0;
604 }
605
/* Usage lines for perf record, passed to the option parser; NULL-terminated. */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL,
};
611
612 static const struct option options[] = {
613         OPT_CALLBACK('e', "event", NULL, "event",
614                      "event selector. use 'perf list' to list available events",
615                      parse_events),
616         OPT_INTEGER('p', "pid", &target_pid,
617                     "record events on existing pid"),
618         OPT_INTEGER('r', "realtime", &realtime_prio,
619                     "collect data with this RT SCHED_FIFO priority"),
620         OPT_BOOLEAN('a', "all-cpus", &system_wide,
621                             "system-wide collection from all CPUs"),
622         OPT_BOOLEAN('A', "append", &append_file,
623                             "append to the output file to do incremental profiling"),
624         OPT_BOOLEAN('f', "force", &force,
625                         "overwrite existing data file"),
626         OPT_LONG('c', "count", &default_interval,
627                     "event period to sample"),
628         OPT_STRING('o', "output", &output_name, "file",
629                     "output file name"),
630         OPT_BOOLEAN('i', "inherit", &inherit,
631                     "child tasks inherit counters"),
632         OPT_INTEGER('F', "freq", &freq,
633                     "profile at this frequency"),
634         OPT_INTEGER('m', "mmap-pages", &mmap_pages,
635                     "number of mmap data pages"),
636         OPT_BOOLEAN('g', "call-graph", &call_graph,
637                     "do call-graph (stack chain/backtrace) recording"),
638         OPT_BOOLEAN('v', "verbose", &verbose,
639                     "be more verbose (show counter open errors, etc)"),
640         OPT_BOOLEAN('s', "stat", &inherit_stat,
641                     "per thread counts"),
642         OPT_BOOLEAN('d', "data", &sample_address,
643                     "Sample addresses"),
644         OPT_BOOLEAN('n', "no-samples", &no_samples,
645                     "don't sample"),
646         OPT_END()
647 };
648
649 int cmd_record(int argc, const char **argv, const char *prefix __used)
650 {
651         int counter;
652
653         argc = parse_options(argc, argv, options, record_usage,
654                 PARSE_OPT_STOP_AT_NON_OPTION);
655         if (!argc && target_pid == -1 && !system_wide)
656                 usage_with_options(record_usage, options);
657
658         if (!nr_counters) {
659                 nr_counters     = 1;
660                 attrs[0].type   = PERF_TYPE_HARDWARE;
661                 attrs[0].config = PERF_COUNT_HW_CPU_CYCLES;
662         }
663
664         for (counter = 0; counter < nr_counters; counter++) {
665                 if (attrs[counter].sample_period)
666                         continue;
667
668                 attrs[counter].sample_period = default_interval;
669         }
670
671         return __cmd_record(argc, argv);
672 }