perf stat: Analyze front-end and back-end stall counts
Ingo Molnar [Fri, 29 Apr 2011 11:49:08 +0000 (13:49 +0200)]
Sample output:

 Performance counter stats for './loop_1b':

        873.691065 task-clock               #    1.000 CPUs utilized
                 1 context-switches         #    0.000 M/sec
                 1 CPU-migrations           #    0.000 M/sec
                96 page-faults              #    0.000 M/sec
     2,012,637,222 cycles                   #    2.304 GHz                      (66.58%)
     1,001,397,911 stalled-cycles-frontend  #   49.76% frontend cycles idle     (66.58%)
         7,523,398 stalled-cycles-backend   #    0.37%  backend cycles idle     (66.76%)
     2,004,551,046 instructions             #    1.00  insns per cycle
                                            #    0.50  stalled cycles per insn  (66.80%)
     1,001,304,992 branches                 # 1146.063 M/sec                    (66.76%)
            39,453 branch-misses            #    0.00% of all branches          (66.64%)

        0.874046121  seconds time elapsed

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/n/tip-7y40wib8n003io7hjpn1dsrm@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>

tools/perf/builtin-stat.c
tools/perf/util/parse-events.c

index 6a4a8a3..e454499 100644 (file)
@@ -201,7 +201,8 @@ static double stddev_stats(struct stats *stats)
 
 struct stats                   runtime_nsecs_stats[MAX_NR_CPUS];
 struct stats                   runtime_cycles_stats[MAX_NR_CPUS];
-struct stats                   runtime_stalled_cycles_stats[MAX_NR_CPUS];
+struct stats                   runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
+struct stats                   runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
 struct stats                   runtime_branches_stats[MAX_NR_CPUS];
 struct stats                   runtime_cacherefs_stats[MAX_NR_CPUS];
 struct stats                   runtime_l1_dcache_stats[MAX_NR_CPUS];
@@ -251,8 +252,10 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
                update_stats(&runtime_nsecs_stats[0], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
                update_stats(&runtime_cycles_stats[0], count[0]);
+       else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
+               update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
-               update_stats(&runtime_stalled_cycles_stats[0], count[0]);
+               update_stats(&runtime_stalled_cycles_back_stats[0], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
                update_stats(&runtime_branches_stats[0], count[0]);
        else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
@@ -478,7 +481,30 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
                fprintf(stderr, " # %8.3f CPUs utilized          ", avg / avg_stats(&walltime_nsecs_stats));
 }
 
-static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, double avg)
+static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg)
+{
+       double total, ratio = 0.0;
+       const char *color;
+
+       total = avg_stats(&runtime_cycles_stats[cpu]);
+
+       if (total)
+               ratio = avg / total * 100.0;
+
+       color = PERF_COLOR_NORMAL;
+       if (ratio > 75.0)
+               color = PERF_COLOR_RED;
+       else if (ratio > 50.0)
+               color = PERF_COLOR_MAGENTA;
+       else if (ratio > 20.0)
+               color = PERF_COLOR_YELLOW;
+
+       fprintf(stderr, " #   ");
+       color_fprintf(stderr, color, "%5.2f%%", ratio);
+       fprintf(stderr, " frontend cycles idle   ");
+}
+
+static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __used, double avg)
 {
        double total, ratio = 0.0;
        const char *color;
@@ -498,7 +524,7 @@ static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, doubl
 
        fprintf(stderr, " #   ");
        color_fprintf(stderr, color, "%5.2f%%", ratio);
-       fprintf(stderr, " of all cycles are idle ");
+       fprintf(stderr, "  backend cycles idle   ");
 }
 
 static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg)
@@ -583,7 +609,8 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
 
                fprintf(stderr, " #    %4.2f  insns per cycle        ", ratio);
 
-               total = avg_stats(&runtime_stalled_cycles_stats[cpu]);
+               total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
+               total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
 
                if (total && avg) {
                        ratio = total / avg;
@@ -609,8 +636,10 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
 
                fprintf(stderr, " # %8.3f %% of all cache refs    ", ratio);
 
+       } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
+               print_stalled_cycles_frontend(cpu, evsel, avg);
        } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
-               print_stalled_cycles(cpu, evsel, avg);
+               print_stalled_cycles_backend(cpu, evsel, avg);
        } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
                total = avg_stats(&runtime_nsecs_stats[cpu]);
 
index 04d2f0a..8a407f3 100644 (file)
@@ -60,7 +60,7 @@ static struct event_symbol event_symbols[] = {
 #define PERF_EVENT_TYPE(config)                __PERF_EVENT_FIELD(config, TYPE)
 #define PERF_EVENT_ID(config)          __PERF_EVENT_FIELD(config, EVENT)
 
-static const char *hw_event_names[] = {
+static const char *hw_event_names[PERF_COUNT_HW_MAX] = {
        "cycles",
        "instructions",
        "cache-references",
@@ -68,10 +68,11 @@ static const char *hw_event_names[] = {
        "branches",
        "branch-misses",
        "bus-cycles",
-       "stalled-cycles",
+       "stalled-cycles-frontend",
+       "stalled-cycles-backend",
 };
 
-static const char *sw_event_names[] = {
+static const char *sw_event_names[PERF_COUNT_SW_MAX] = {
        "cpu-clock",
        "task-clock",
        "page-faults",