perf stat: Add -d -d and -d -d -d options to show more CPU events
[linux-3.10.git] / tools / perf / builtin-stat.c
1 /*
2  * builtin-stat.c
3  *
4  * Builtin stat command: Give a precise performance counters summary
5  * overview about any workload, CPU or specific PID.
6  *
7  * Sample output:
8
9    $ perf stat ./hackbench 10
10
11   Time: 0.118
12
13   Performance counter stats for './hackbench 10':
14
15        1708.761321 task-clock                #   11.037 CPUs utilized
16             41,190 context-switches          #    0.024 M/sec
17              6,735 CPU-migrations            #    0.004 M/sec
18             17,318 page-faults               #    0.010 M/sec
19      5,205,202,243 cycles                    #    3.046 GHz
20      3,856,436,920 stalled-cycles-frontend   #   74.09% frontend cycles idle
21      1,600,790,871 stalled-cycles-backend    #   30.75% backend  cycles idle
22      2,603,501,247 instructions              #    0.50  insns per cycle
23                                              #    1.48  stalled cycles per insn
24        484,357,498 branches                  #  283.455 M/sec
25          6,388,934 branch-misses             #    1.32% of all branches
26
27         0.154822978  seconds time elapsed
28
29  *
30  * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
31  *
32  * Improvements and fixes by:
33  *
34  *   Arjan van de Ven <arjan@linux.intel.com>
35  *   Yanmin Zhang <yanmin.zhang@intel.com>
36  *   Wu Fengguang <fengguang.wu@intel.com>
37  *   Mike Galbraith <efault@gmx.de>
38  *   Paul Mackerras <paulus@samba.org>
39  *   Jaswinder Singh Rajput <jaswinder@kernel.org>
40  *
41  * Released under the GPL v2. (and only v2, not any later version)
42  */
43
44 #include "perf.h"
45 #include "builtin.h"
46 #include "util/util.h"
47 #include "util/parse-options.h"
48 #include "util/parse-events.h"
49 #include "util/event.h"
50 #include "util/evlist.h"
51 #include "util/evsel.h"
52 #include "util/debug.h"
53 #include "util/color.h"
54 #include "util/header.h"
55 #include "util/cpumap.h"
56 #include "util/thread.h"
57 #include "util/thread_map.h"
58
59 #include <sys/prctl.h>
60 #include <math.h>
61 #include <locale.h>
62
/* Default CSV field separator — presumably installed into csv_sep when -x is
 * given without an argument; confirm in the option-handling code. */
#define DEFAULT_SEPARATOR       " "
64
/*
 * Events counted when no -e option is given: the basic software events
 * plus the core hardware pipeline events (cycles, stalls, instructions,
 * branches) shown in the sample output at the top of this file.
 */
static struct perf_event_attr default_attrs[] = {

  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK              },
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES        },
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS          },
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS             },

  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES              },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND  },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS            },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS     },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES           },

};
80
/*
 * Detailed stats (-d), covering the L1 and last level data caches:
 *
 * HW cache events are encoded as: cache-id | (op << 8) | (result << 16).
 */
static struct perf_event_attr detailed_attrs[] = {

  /* L1d read accesses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
         PERF_COUNT_HW_CACHE_L1D                <<  0  |
        (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
        (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)                          },

  /* L1d read misses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
         PERF_COUNT_HW_CACHE_L1D                <<  0  |
        (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
        (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)                          },

  /* Last-level-cache read accesses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
         PERF_COUNT_HW_CACHE_LL                 <<  0  |
        (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
        (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)                          },

  /* Last-level-cache read misses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
         PERF_COUNT_HW_CACHE_LL                 <<  0  |
        (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
        (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)                          },
};
110
/*
 * Very detailed stats (-d -d), covering the instruction cache and the TLB caches:
 */
static struct perf_event_attr very_detailed_attrs[] = {

  /* L1i read accesses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
         PERF_COUNT_HW_CACHE_L1I                <<  0  |
        (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
        (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)                          },

  /* L1i read misses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
         PERF_COUNT_HW_CACHE_L1I                <<  0  |
        (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
        (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)                          },

  /* dTLB read accesses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
         PERF_COUNT_HW_CACHE_DTLB               <<  0  |
        (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
        (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)                          },

  /* dTLB read misses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
         PERF_COUNT_HW_CACHE_DTLB               <<  0  |
        (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
        (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)                          },

  /* iTLB read accesses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
         PERF_COUNT_HW_CACHE_ITLB               <<  0  |
        (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
        (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)                          },

  /* iTLB read misses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
         PERF_COUNT_HW_CACHE_ITLB               <<  0  |
        (PERF_COUNT_HW_CACHE_OP_READ            <<  8) |
        (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)                          },

};
153
/*
 * Very, very detailed stats (-d -d -d), adding prefetch events:
 */
static struct perf_event_attr very_very_detailed_attrs[] = {

  /* L1d prefetch accesses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
         PERF_COUNT_HW_CACHE_L1D                <<  0  |
        (PERF_COUNT_HW_CACHE_OP_PREFETCH        <<  8) |
        (PERF_COUNT_HW_CACHE_RESULT_ACCESS      << 16)                          },

  /* L1d prefetch misses */
  { .type = PERF_TYPE_HW_CACHE,
    .config =
         PERF_COUNT_HW_CACHE_L1D                <<  0  |
        (PERF_COUNT_HW_CACHE_OP_PREFETCH        <<  8) |
        (PERF_COUNT_HW_CACHE_RESULT_MISS        << 16)                          },
};
171
172
173
/* Event list built from -e options (or from the default/-d tables). */
struct perf_evlist              *evsel_list;

static bool                     system_wide                     =  false;
static int                      run_idx                         =  0;      /* current repetition */

static int                      run_count                       =  1;      /* number of repetitions */
static bool                     no_inherit                      = false;   /* -i: children don't inherit counters */
static bool                     scale                           =  true;   /* read enabled/running times for scaling */
static bool                     no_aggr                         = false;   /* print per-CPU rather than aggregated */
static pid_t                    target_pid                      = -1;      /* -p: existing process to count */
static pid_t                    target_tid                      = -1;      /* -t: existing thread to count */
static pid_t                    child_pid                       = -1;      /* forked workload, -1 if none */
static bool                     null_run                        =  false;
static int                      detailed_run                    =  0;      /* -d repeat count (1..3) */
static bool                     sync_run                        =  false;
static bool                     big_num                         =  true;   /* thousands separators in counts */
static int                      big_num_opt                     =  -1;     /* set by stat__set_big_num(); -1 = unset */
static const char               *cpu_list;
static const char               *csv_sep                        = NULL;
static bool                     csv_output                      = false;

/* Set by skip_signal() to stop the counting loop when not forking. */
static volatile int done = 0;
196
/*
 * Running mean/variance accumulator for Welford's online algorithm
 * (see update_stats()): n = sample count, mean = running mean,
 * M2 = sum of squared deviations from the mean.
 */
struct stats
{
        double n, mean, M2;
};

/*
 * Per-evsel private state: res_stats[0] tracks the counter value,
 * [1]/[2] the time-enabled/time-running totals filled in by
 * read_counter_aggr() when scaling is active.
 */
struct perf_stat {
        struct stats      res_stats[3];
};
205
206 static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
207 {
208         evsel->priv = zalloc(sizeof(struct perf_stat));
209         return evsel->priv == NULL ? -ENOMEM : 0;
210 }
211
212 static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
213 {
214         free(evsel->priv);
215         evsel->priv = NULL;
216 }
217
/*
 * Fold one sample into *stats using Welford's online algorithm, which
 * avoids the catastrophic cancellation of the naive sum / sum-of-squares
 * approach.  The statement order below is essential: M2 uses the mean
 * both before and after the update.
 */
static void update_stats(struct stats *stats, u64 val)
{
        double delta;

        stats->n++;
        delta = val - stats->mean;
        stats->mean += delta / stats->n;
        /* M2 accumulates the sum of squared deviations from the mean. */
        stats->M2 += delta*(val - stats->mean);
}
227
/* Running mean of all samples folded in so far (0.0 if none). */
static double avg_stats(struct stats *stats)
{
        return stats->mean;
}
232
233 /*
234  * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
235  *
236  *       (\Sum n_i^2) - ((\Sum n_i)^2)/n
237  * s^2 = -------------------------------
238  *                  n - 1
239  *
240  * http://en.wikipedia.org/wiki/Stddev
241  *
242  * The std dev of the mean is related to the std dev by:
243  *
244  *             s
245  * s_mean = -------
246  *          sqrt(n)
247  *
248  */
249 static double stddev_stats(struct stats *stats)
250 {
251         double variance = stats->M2 / (stats->n - 1);
252         double variance_mean = variance / stats->n;
253
254         return sqrt(variance_mean);
255 }
256
/*
 * "Shadow" stats recorded by update_shadow_stats() and consumed at
 * printout time to derive ratios (GHz, insns/cycle, miss rates, M/sec).
 * Indexed per CPU; slot 0 is used in aggregated mode.
 * walltime_nsecs_stats accumulates elapsed wall-clock time across runs.
 */
struct stats                    runtime_nsecs_stats[MAX_NR_CPUS];
struct stats                    runtime_cycles_stats[MAX_NR_CPUS];
struct stats                    runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
struct stats                    runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
struct stats                    runtime_branches_stats[MAX_NR_CPUS];
struct stats                    runtime_cacherefs_stats[MAX_NR_CPUS];
struct stats                    runtime_l1_dcache_stats[MAX_NR_CPUS];
struct stats                    walltime_nsecs_stats;
265
/*
 * Configure and open one counter, either per-CPU (system-wide mode)
 * or per-thread.  Returns the perf_evsel open result (negative on
 * failure; caller inspects errno).
 */
static int create_perf_stat_counter(struct perf_evsel *evsel)
{
        struct perf_event_attr *attr = &evsel->attr;

        /* Ask the kernel for enabled/running times so counts can be scaled. */
        if (scale)
                attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
                                    PERF_FORMAT_TOTAL_TIME_RUNNING;

        attr->inherit = !no_inherit;

        if (system_wide)
                return perf_evsel__open_per_cpu(evsel, evsel_list->cpus, false);

        /*
         * With no -p/-t target we fork the workload ourselves: start
         * disabled and let exec() enable the counter, so perf's own
         * setup isn't measured.
         */
        if (target_pid == -1 && target_tid == -1) {
                attr->disabled = 1;
                attr->enable_on_exec = 1;
        }

        return perf_evsel__open_per_thread(evsel, evsel_list->threads, false);
}
286
287 /*
288  * Does the counter have nsecs as a unit?
289  */
290 static inline int nsec_counter(struct perf_evsel *evsel)
291 {
292         if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) ||
293             perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
294                 return 1;
295
296         return 0;
297 }
298
/*
 * Update various tracking values we maintain to print
 * more semantic information such as miss/hit ratios,
 * instruction rates, etc:
 *
 * NOTE(review): everything is stored in slot [0] here, yet the per-CPU
 * (no_aggr) printout path reads slot [cpu] — looks like per-CPU ratio
 * lines can never find their shadow stats; verify against upstream.
 */
static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
{
        if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
                update_stats(&runtime_nsecs_stats[0], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
                update_stats(&runtime_cycles_stats[0], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
                update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
                update_stats(&runtime_stalled_cycles_back_stats[0], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
                update_stats(&runtime_branches_stats[0], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
                update_stats(&runtime_cacherefs_stats[0], count[0]);
        /* HW_CACHE config with op/result zero, i.e. L1d read accesses. */
        else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
                update_stats(&runtime_l1_dcache_stats[0], count[0]);
}
321
322 /*
323  * Read out the results of a single counter:
324  * aggregate counts across CPUs in system-wide mode
325  */
326 static int read_counter_aggr(struct perf_evsel *counter)
327 {
328         struct perf_stat *ps = counter->priv;
329         u64 *count = counter->counts->aggr.values;
330         int i;
331
332         if (__perf_evsel__read(counter, evsel_list->cpus->nr,
333                                evsel_list->threads->nr, scale) < 0)
334                 return -1;
335
336         for (i = 0; i < 3; i++)
337                 update_stats(&ps->res_stats[i], count[i]);
338
339         if (verbose) {
340                 fprintf(stderr, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
341                         event_name(counter), count[0], count[1], count[2]);
342         }
343
344         /*
345          * Save the full runtime - to allow normalization during printout:
346          */
347         update_shadow_stats(counter, count);
348
349         return 0;
350 }
351
352 /*
353  * Read out the results of a single counter:
354  * do not aggregate counts across CPUs in system-wide mode
355  */
356 static int read_counter(struct perf_evsel *counter)
357 {
358         u64 *count;
359         int cpu;
360
361         for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
362                 if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
363                         return -1;
364
365                 count = counter->counts->cpu[cpu].values;
366
367                 update_shadow_stats(counter, count);
368         }
369
370         return 0;
371 }
372
/*
 * Open the counters and run one measured interval: either fork/exec
 * the given command (argc > 0), or count an existing target until a
 * signal sets 'done'.  Returns the child's exit status (0 when not
 * forking), or -1 on filter-setup failure.
 */
static int run_perf_stat(int argc __used, const char **argv)
{
        unsigned long long t0, t1;
        struct perf_evsel *counter;
        int status = 0;
        int child_ready_pipe[2], go_pipe[2];
        const bool forks = (argc > 0);
        char buf;

        if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
                perror("failed to create pipes");
                exit(1);
        }

        if (forks) {
                if ((child_pid = fork()) < 0)
                        perror("failed to fork");

                /* Child: signal readiness, block until the parent says go. */
                if (!child_pid) {
                        close(child_ready_pipe[0]);
                        close(go_pipe[1]);
                        fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);

                        /*
                         * Do a dummy execvp to get the PLT entry resolved,
                         * so we avoid the resolver overhead on the real
                         * execvp call.
                         */
                        execvp("", (char **)argv);

                        /*
                         * Tell the parent we're ready to go
                         */
                        close(child_ready_pipe[1]);

                        /*
                         * Wait until the parent tells us to go.
                         */
                        if (read(go_pipe[0], &buf, 1) == -1)
                                perror("unable to read pipe");

                        execvp(argv[0], (char **)argv);

                        perror(argv[0]);
                        exit(-1);
                }

                /* Retarget the per-thread counters at the new child. */
                if (target_tid == -1 && target_pid == -1 && !system_wide)
                        evsel_list->threads->map[0] = child_pid;

                /*
                 * Wait for the child to be ready to exec.
                 */
                close(child_ready_pipe[1]);
                close(go_pipe[0]);
                if (read(child_ready_pipe[0], &buf, 1) == -1)
                        perror("unable to read pipe");
                close(child_ready_pipe[0]);
        }

        list_for_each_entry(counter, &evsel_list->entries, node) {
                if (create_perf_stat_counter(counter) < 0) {
                        /*
                         * Events the kernel doesn't support are skipped
                         * (warned about in verbose mode) rather than
                         * aborting the whole run.
                         */
                        if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) {
                                if (verbose)
                                        ui__warning("%s event is not supported by the kernel.\n",
                                                    event_name(counter));
                                continue;
                        }

                        if (errno == EPERM || errno == EACCES) {
                                error("You may not have permission to collect %sstats.\n"
                                      "\t Consider tweaking"
                                      " /proc/sys/kernel/perf_event_paranoid or running as root.",
                                      system_wide ? "system-wide " : "");
                        } else {
                                error("open_counter returned with %d (%s). "
                                      "/bin/dmesg may provide additional information.\n",
                                       errno, strerror(errno));
                        }
                        if (child_pid != -1)
                                kill(child_pid, SIGTERM);
                        die("Not all events could be opened.\n");
                        return -1; /* not reached: die() terminates */
                }
        }

        if (perf_evlist__set_filters(evsel_list)) {
                error("failed to set filter with %d (%s)\n", errno,
                        strerror(errno));
                return -1;
        }

        /*
         * Enable counters and exec the command:
         */
        t0 = rdclock();

        if (forks) {
                /* Closing go_pipe's write end unblocks the child's read. */
                close(go_pipe[1]);
                wait(&status);
        } else {
                while(!done) sleep(1);
        }

        t1 = rdclock();

        update_stats(&walltime_nsecs_stats, t1 - t0);

        /* Read back and close the counters, per-CPU or aggregated. */
        if (no_aggr) {
                list_for_each_entry(counter, &evsel_list->entries, node) {
                        read_counter(counter);
                        perf_evsel__close_fd(counter, evsel_list->cpus->nr, 1);
                }
        } else {
                list_for_each_entry(counter, &evsel_list->entries, node) {
                        read_counter_aggr(counter);
                        perf_evsel__close_fd(counter, evsel_list->cpus->nr,
                                             evsel_list->threads->nr);
                }
        }

        return WEXITSTATUS(status);
}
496
/* Append "  ( +-XX.XX% )" where pct = 100*total/avg (0 when avg is 0). */
static void print_noise_pct(double total, double avg)
{
        double pct = avg ? 100.0 * total / avg : 0.0;

        fprintf(stderr, "  ( +-%6.2f%% )", pct);
}
506
507 static void print_noise(struct perf_evsel *evsel, double avg)
508 {
509         struct perf_stat *ps;
510
511         if (run_count == 1)
512                 return;
513
514         ps = evsel->priv;
515         print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
516 }
517
/*
 * Print one nanosecond-unit counter (cpu-clock/task-clock) in
 * milliseconds; for task-clock, also print a "CPUs utilized" ratio
 * unless in CSV mode.  cpu is only used when no_aggr is set.
 */
static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
{
        double msecs = avg / 1e6;
        char cpustr[16] = { '\0', };
        const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s";

        /* In per-CPU mode, prefix the line with the CPU id. */
        if (no_aggr)
                sprintf(cpustr, "CPU%*d%s",
                        csv_output ? 0 : -4,
                        evsel_list->cpus->map[cpu], csv_sep);

        fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel));

        if (evsel->cgrp)
                fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);

        /* CSV output carries raw values only, no derived ratios. */
        if (csv_output)
                return;

        /* task-clock / wall-clock = number of CPUs kept busy. */
        if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
                fprintf(stderr, " # %8.3f CPUs utilized          ", avg / avg_stats(&walltime_nsecs_stats));
}
540
541 static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg)
542 {
543         double total, ratio = 0.0;
544         const char *color;
545
546         total = avg_stats(&runtime_cycles_stats[cpu]);
547
548         if (total)
549                 ratio = avg / total * 100.0;
550
551         color = PERF_COLOR_NORMAL;
552         if (ratio > 50.0)
553                 color = PERF_COLOR_RED;
554         else if (ratio > 30.0)
555                 color = PERF_COLOR_MAGENTA;
556         else if (ratio > 10.0)
557                 color = PERF_COLOR_YELLOW;
558
559         fprintf(stderr, " #  ");
560         color_fprintf(stderr, color, "%6.2f%%", ratio);
561         fprintf(stderr, " frontend cycles idle   ");
562 }
563
564 static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __used, double avg)
565 {
566         double total, ratio = 0.0;
567         const char *color;
568
569         total = avg_stats(&runtime_cycles_stats[cpu]);
570
571         if (total)
572                 ratio = avg / total * 100.0;
573
574         color = PERF_COLOR_NORMAL;
575         if (ratio > 75.0)
576                 color = PERF_COLOR_RED;
577         else if (ratio > 50.0)
578                 color = PERF_COLOR_MAGENTA;
579         else if (ratio > 20.0)
580                 color = PERF_COLOR_YELLOW;
581
582         fprintf(stderr, " #  ");
583         color_fprintf(stderr, color, "%6.2f%%", ratio);
584         fprintf(stderr, " backend  cycles idle   ");
585 }
586
587 static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg)
588 {
589         double total, ratio = 0.0;
590         const char *color;
591
592         total = avg_stats(&runtime_branches_stats[cpu]);
593
594         if (total)
595                 ratio = avg / total * 100.0;
596
597         color = PERF_COLOR_NORMAL;
598         if (ratio > 20.0)
599                 color = PERF_COLOR_RED;
600         else if (ratio > 10.0)
601                 color = PERF_COLOR_MAGENTA;
602         else if (ratio > 5.0)
603                 color = PERF_COLOR_YELLOW;
604
605         fprintf(stderr, " #  ");
606         color_fprintf(stderr, color, "%6.2f%%", ratio);
607         fprintf(stderr, " of all branches        ");
608 }
609
610 static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
611 {
612         double total, ratio = 0.0;
613         const char *color;
614
615         total = avg_stats(&runtime_l1_dcache_stats[cpu]);
616
617         if (total)
618                 ratio = avg / total * 100.0;
619
620         color = PERF_COLOR_NORMAL;
621         if (ratio > 20.0)
622                 color = PERF_COLOR_RED;
623         else if (ratio > 10.0)
624                 color = PERF_COLOR_MAGENTA;
625         else if (ratio > 5.0)
626                 color = PERF_COLOR_YELLOW;
627
628         fprintf(stderr, " #  ");
629         color_fprintf(stderr, color, "%6.2f%%", ratio);
630         fprintf(stderr, " of all L1-dcache hits  ");
631 }
632
/*
 * Print one event count plus, where the needed shadow stats exist, a
 * derived ratio (insns/cycle, miss rate, GHz, M/sec) computed from the
 * values recorded in update_shadow_stats().
 */
static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
{
        double total, ratio = 0.0;
        char cpustr[16] = { '\0', };
        const char *fmt;

        if (csv_output)
                fmt = "%s%.0f%s%s";
        else if (big_num)
                fmt = "%s%'18.0f%s%-25s";    /* ' = thousands grouping */
        else
                fmt = "%s%18.0f%s%-25s";

        if (no_aggr)
                sprintf(cpustr, "CPU%*d%s",
                        csv_output ? 0 : -4,
                        evsel_list->cpus->map[cpu], csv_sep);
        else
                cpu = 0;        /* aggregated mode uses the [0] stats slots */

        fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel));

        if (evsel->cgrp)
                fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);

        /* CSV output carries raw values only. */
        if (csv_output)
                return;

        if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
                total = avg_stats(&runtime_cycles_stats[cpu]);

                if (total)
                        ratio = avg / total;

                fprintf(stderr, " #   %5.2f  insns per cycle        ", ratio);

                /* Report against the larger of the two stall counts. */
                total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
                total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));

                if (total && avg) {
                        ratio = total / avg;
                        fprintf(stderr, "\n                                             #   %5.2f  stalled cycles per insn", ratio);
                }

        } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
                        runtime_branches_stats[cpu].n != 0) {
                print_branch_misses(cpu, evsel, avg);
        /* L1d read misses, spelled out since the op/result fields are nonzero. */
        } else if (
                evsel->attr.type == PERF_TYPE_HW_CACHE &&
                evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
                                        ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
                                        ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
                        runtime_l1_dcache_stats[cpu].n != 0) {
                print_l1_dcache_misses(cpu, evsel, avg);
        } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
                        runtime_cacherefs_stats[cpu].n != 0) {
                total = avg_stats(&runtime_cacherefs_stats[cpu]);

                if (total)
                        ratio = avg * 100 / total;

                fprintf(stderr, " # %8.3f %% of all cache refs    ", ratio);

        } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
                print_stalled_cycles_frontend(cpu, evsel, avg);
        } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
                print_stalled_cycles_backend(cpu, evsel, avg);
        } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
                total = avg_stats(&runtime_nsecs_stats[cpu]);

                if (total)
                        ratio = 1.0 * avg / total;

                /* cycles per nanosecond == GHz */
                fprintf(stderr, " # %8.3f GHz                    ", ratio);
        } else if (runtime_nsecs_stats[cpu].n != 0) {
                total = avg_stats(&runtime_nsecs_stats[cpu]);

                if (total)
                        ratio = 1000.0 * avg / total;

                fprintf(stderr, " # %8.3f M/sec                  ", ratio);
        } else {
                /* No applicable shadow stats: pad to keep columns aligned. */
                fprintf(stderr, "                                   ");
        }
}
718
/*
 * Print out the results of a single counter:
 * aggregated counts in system-wide mode
 */
static void print_counter_aggr(struct perf_evsel *counter)
{
        struct perf_stat *ps = counter->priv;
        double avg = avg_stats(&ps->res_stats[0]);
        int scaled = counter->counts->scaled;

        /* scaled == -1 means the counter never ran at all. */
        if (scaled == -1) {
                fprintf(stderr, "%*s%s%*s",
                        csv_output ? 0 : 18,
                        "<not counted>",
                        csv_sep,
                        csv_output ? 0 : -24,
                        event_name(counter));

                if (counter->cgrp)
                        fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);

                fputc('\n', stderr);
                return;
        }

        /* cpu == -1: aggregated, the cpu argument is unused downstream. */
        if (nsec_counter(counter))
                nsec_printout(-1, counter, avg);
        else
                abs_printout(-1, counter, avg);

        if (csv_output) {
                fputc('\n', stderr);
                return;
        }

        print_noise(counter, avg);

        /* Counter was multiplexed: show the fraction of time it ran. */
        if (scaled) {
                double avg_enabled, avg_running;

                avg_enabled = avg_stats(&ps->res_stats[1]);
                avg_running = avg_stats(&ps->res_stats[2]);

                fprintf(stderr, " [%5.2f%%]", 100 * avg_running / avg_enabled);
        }
        fprintf(stderr, "\n");
}
766
/*
 * Print out the results of a single counter:
 * does not use aggregated count in system-wide
 */
static void print_counter(struct perf_evsel *counter)
{
        u64 ena, run, val;
        int cpu;

        for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
                val = counter->counts->cpu[cpu].val;
                ena = counter->counts->cpu[cpu].ena;
                run = counter->counts->cpu[cpu].run;

                /* Counter never ran on this CPU. */
                if (run == 0 || ena == 0) {
                        fprintf(stderr, "CPU%*d%s%*s%s%*s",
                                csv_output ? 0 : -4,
                                evsel_list->cpus->map[cpu], csv_sep,
                                csv_output ? 0 : 18,
                                "<not counted>", csv_sep,
                                csv_output ? 0 : -24,
                                event_name(counter));

                        if (counter->cgrp)
                                fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);

                        fputc('\n', stderr);
                        continue;
                }

                if (nsec_counter(counter))
                        nsec_printout(cpu, counter, val);
                else
                        abs_printout(cpu, counter, val);

                if (!csv_output) {
                        print_noise(counter, 1.0);

                        /* Multiplexed: show the fraction of time it ran. */
                        if (run != ena)
                                fprintf(stderr, "  (%.2f%%)", 100.0 * run / ena);
                }
                fputc('\n', stderr);
        }
}
810
/*
 * Print the final report: a header describing what was measured, one
 * line per counter (per-CPU or aggregated), then elapsed wall time.
 */
static void print_stat(int argc, const char **argv)
{
        struct perf_evsel *counter;
        int i;

        /* Keep stdout/stderr interleaving sane before writing to stderr. */
        fflush(stdout);

        if (!csv_output) {
                fprintf(stderr, "\n");
                fprintf(stderr, " Performance counter stats for ");
                /* No -p/-t target: we ran a command, echo its argv. */
                if(target_pid == -1 && target_tid == -1) {
                        fprintf(stderr, "\'%s", argv[0]);
                        for (i = 1; i < argc; i++)
                                fprintf(stderr, " %s", argv[i]);
                } else if (target_pid != -1)
                        fprintf(stderr, "process id \'%d", target_pid);
                else
                        fprintf(stderr, "thread id \'%d", target_tid);

                fprintf(stderr, "\'");
                if (run_count > 1)
                        fprintf(stderr, " (%d runs)", run_count);
                fprintf(stderr, ":\n\n");
        }

        if (no_aggr) {
                list_for_each_entry(counter, &evsel_list->entries, node)
                        print_counter(counter);
        } else {
                list_for_each_entry(counter, &evsel_list->entries, node)
                        print_counter_aggr(counter);
        }

        if (!csv_output) {
                fprintf(stderr, "\n");
                fprintf(stderr, " %18.9f  seconds time elapsed",
                                avg_stats(&walltime_nsecs_stats)/1e9);
                if (run_count > 1) {
                        print_noise_pct(stddev_stats(&walltime_nsecs_stats),
                                        avg_stats(&walltime_nsecs_stats));
                }
                fprintf(stderr, "\n\n");
        }
}
855
856 static volatile int signr = -1;
857
858 static void skip_signal(int signo)
859 {
860         if(child_pid == -1)
861                 done = 1;
862
863         signr = signo;
864 }
865
/*
 * atexit() handler: terminate a still-running child workload, then, if
 * we exited because of a signal, restore the default handler and
 * re-raise it so our parent sees the true termination status.
 */
static void sig_atexit(void)
{
	if (child_pid != -1)
		kill(child_pid, SIGTERM);

	/* No signal was caught: normal exit, nothing to re-raise. */
	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}
877
/* Usage string printed by usage_with_options() on invalid invocation. */
static const char * const stat_usage[] = {
	"perf stat [<options>] [<command>]",
	NULL
};
882
883 static int stat__set_big_num(const struct option *opt __used,
884                              const char *s __used, int unset)
885 {
886         big_num_opt = unset ? 0 : 1;
887         return 0;
888 }
889
890 static const struct option options[] = {
891         OPT_CALLBACK('e', "event", &evsel_list, "event",
892                      "event selector. use 'perf list' to list available events",
893                      parse_events),
894         OPT_CALLBACK(0, "filter", &evsel_list, "filter",
895                      "event filter", parse_filter),
896         OPT_BOOLEAN('i', "no-inherit", &no_inherit,
897                     "child tasks do not inherit counters"),
898         OPT_INTEGER('p', "pid", &target_pid,
899                     "stat events on existing process id"),
900         OPT_INTEGER('t', "tid", &target_tid,
901                     "stat events on existing thread id"),
902         OPT_BOOLEAN('a', "all-cpus", &system_wide,
903                     "system-wide collection from all CPUs"),
904         OPT_BOOLEAN('c', "scale", &scale,
905                     "scale/normalize counters"),
906         OPT_INCR('v', "verbose", &verbose,
907                     "be more verbose (show counter open errors, etc)"),
908         OPT_INTEGER('r', "repeat", &run_count,
909                     "repeat command and print average + stddev (max: 100)"),
910         OPT_BOOLEAN('n', "null", &null_run,
911                     "null run - dont start any counters"),
912         OPT_INCR('d', "detailed", &detailed_run,
913                     "detailed run - start a lot of events"),
914         OPT_BOOLEAN('S', "sync", &sync_run,
915                     "call sync() before starting a run"),
916         OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, 
917                            "print large numbers with thousands\' separators",
918                            stat__set_big_num),
919         OPT_STRING('C', "cpu", &cpu_list, "cpu",
920                     "list of cpus to monitor in system-wide"),
921         OPT_BOOLEAN('A', "no-aggr", &no_aggr,
922                     "disable CPU count aggregation"),
923         OPT_STRING('x', "field-separator", &csv_sep, "separator",
924                    "print counts with custom separator"),
925         OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
926                      "monitor event in cgroup name only",
927                      parse_cgroups),
928         OPT_END()
929 };
930
931 /*
932  * Add default attributes, if there were no attributes specified or
933  * if -d/--detailed, -d -d or -d -d -d is used:
934  */
935 static int add_default_attributes(void)
936 {
937         struct perf_evsel *pos;
938         size_t attr_nr = 0;
939         size_t c;
940
941         /* Set attrs if no event is selected and !null_run: */
942         if (null_run)
943                 return 0;
944
945         if (!evsel_list->nr_entries) {
946                 for (c = 0; c < ARRAY_SIZE(default_attrs); c++) {
947                         pos = perf_evsel__new(default_attrs + c, c + attr_nr);
948                         if (pos == NULL)
949                                 return -1;
950                         perf_evlist__add(evsel_list, pos);
951                 }
952                 attr_nr += c;
953         }
954
955         /* Detailed events get appended to the event list: */
956
957         if (detailed_run <  1)
958                 return 0;
959
960         /* Append detailed run extra attributes: */
961         for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) {
962                 pos = perf_evsel__new(detailed_attrs + c, c + attr_nr);
963                 if (pos == NULL)
964                         return -1;
965                 perf_evlist__add(evsel_list, pos);
966         }
967         attr_nr += c;
968
969         if (detailed_run < 2)
970                 return 0;
971
972         /* Append very detailed run extra attributes: */
973         for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) {
974                 pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr);
975                 if (pos == NULL)
976                         return -1;
977                 perf_evlist__add(evsel_list, pos);
978         }
979
980         if (detailed_run < 3)
981                 return 0;
982
983         /* Append very, very detailed run extra attributes: */
984         for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) {
985                 pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr);
986                 if (pos == NULL)
987                         return -1;
988                 perf_evlist__add(evsel_list, pos);
989         }
990
991
992         return 0;
993 }
994
/*
 * Entry point for 'perf stat': parse options, build the event list,
 * set up cpu/thread maps and per-evsel storage, run the workload
 * run_count times, then print the aggregated results.
 * Returns 0 on success, a negative value on error.
 */
int cmd_stat(int argc, const char **argv, const char *prefix __used)
{
	struct perf_evsel *pos;
	int status = -ENOMEM;

	/* Locale-aware number formatting (thousands' separators). */
	setlocale(LC_ALL, "");

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	argc = parse_options(argc, argv, options, stat_usage,
		PARSE_OPT_STOP_AT_NON_OPTION);

	/* -x <sep> implies CSV output; otherwise use the default separator. */
	if (csv_sep)
		csv_output = true;
	else
		csv_sep = DEFAULT_SEPARATOR;

	/*
	 * let the spreadsheet do the pretty-printing
	 */
	if (csv_output) {
		/* User explicitely passed -B? */
		if (big_num_opt == 1) {
			fprintf(stderr, "-B option not supported with -x\n");
			usage_with_options(stat_usage, options);
		} else /* Nope, so disable big number formatting */
			big_num = false;
	} else if (big_num_opt == 0) /* User passed --no-big-num */
		big_num = false;

	/* Need something to measure: a command, a pid or a tid. */
	if (!argc && target_pid == -1 && target_tid == -1)
		usage_with_options(stat_usage, options);
	if (run_count <= 0)
		usage_with_options(stat_usage, options);

	/* no_aggr, cgroup are for system-wide only */
	if ((no_aggr || nr_cgroups) && !system_wide) {
		fprintf(stderr, "both cgroup and no-aggregation "
			"modes only available in system-wide mode\n");

		usage_with_options(stat_usage, options);
	}

	if (add_default_attributes())
		goto out;

	/* -p without -t: monitor the main thread of that process. */
	if (target_pid != -1)
		target_tid = target_pid;

	evsel_list->threads = thread_map__new(target_pid, target_tid);
	if (evsel_list->threads == NULL) {
		pr_err("Problems finding threads of monitor\n");
		usage_with_options(stat_usage, options);
	}

	if (system_wide)
		evsel_list->cpus = cpu_map__new(cpu_list);
	else
		evsel_list->cpus = cpu_map__dummy_new();

	if (evsel_list->cpus == NULL) {
		perror("failed to parse CPUs map");
		usage_with_options(stat_usage, options);
		return -1;
	}

	/* Per-evsel allocations sized by the cpu and thread maps above. */
	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_evsel__alloc_stat_priv(pos) < 0 ||
		    perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0 ||
		    perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, evsel_list->threads->nr) < 0)
			goto out_free_fd;
	}

	/*
	 * We dont want to block the signals - that would cause
	 * child tasks to inherit that and Ctrl-C would not work.
	 * What we want is for Ctrl-C to work in the exec()-ed
	 * task, but being ignored by perf stat itself:
	 */
	atexit(sig_atexit);
	signal(SIGINT,  skip_signal);
	signal(SIGALRM, skip_signal);
	signal(SIGABRT, skip_signal);

	status = 0;
	for (run_idx = 0; run_idx < run_count; run_idx++) {
		if (run_count != 1 && verbose)
			fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx + 1);

		/* -S: flush dirty pages so repeated runs start similarly. */
		if (sync_run)
			sync();

		status = run_perf_stat(argc, argv);
	}

	/* Print results unless the (last) run itself failed. */
	if (status != -1)
		print_stat(argc, argv);
out_free_fd:
	list_for_each_entry(pos, &evsel_list->entries, node)
		perf_evsel__free_stat_priv(pos);
	perf_evlist__delete_maps(evsel_list);
out:
	perf_evlist__delete(evsel_list);
	return status;
}