misc: tegra-profiler: support multiple PMUs
Alexey Kravets [Wed, 7 Oct 2015 11:47:24 +0000 (14:47 +0300)]
Support multiple PMUs in the tegra-profiler module.
Allow per-CPU capability queries and per-CPU PMU setup.
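
For reference, a minimal user-space sketch of the new per-CPU flow
(illustrative only, not part of this patch; it assumes the ioctl macros
and structures from include/linux/tegra_profiler.h are visible to user
space, and the four-CPU loop and error handling are placeholders):

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <unistd.h>
  #include <linux/tegra_profiler.h>

  /* Query per-CPU capabilities, then program one event on that CPU. */
  static int setup_pmu_for_cpu(int fd, int cpuid)
  {
          struct quadd_comm_cap_for_cpu cap;
          struct quadd_pmu_setup_for_cpu setup;

          memset(&cap, 0, sizeof(cap));
          cap.cpuid = cpuid;
          if (ioctl(fd, IOCTL_GET_CAP_FOR_CPU, &cap) < 0)
                  return -1;

          memset(&setup, 0, sizeof(setup));
          setup.cpuid = cpuid;
          if (cap.events_cap.cpu_cycles)
                  setup.events[setup.nr_events++] = QUADD_EVENT_TYPE_CPU_CYCLES;

          return ioctl(fd, IOCTL_SETUP_PMU_FOR_CPU, &setup);
  }

  int main(void)
  {
          int cpu, fd = open("/dev/quadd", O_RDWR);

          if (fd < 0)
                  return 1;

          for (cpu = 0; cpu < 4; cpu++)   /* CPUs of interest (assumed) */
                  if (setup_pmu_for_cpu(fd, cpu) < 0)
                          fprintf(stderr, "cpu %d: PMU setup failed\n", cpu);

          close(fd);
          return 0;
  }

Profiling ioctls return -EACCES until both the global parameters and the
per-CPU PMU setup for every possible CPU have been accepted (see
ready_to_profile() in comm.c).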

Bug 1694191
Bug 200142156

(cherry picked from commit 3f18b6372263fb76aebfc66a5bbe76c44c4a5daa)
Change-Id: I10e1779aa76814b1615610f1acdb700875349607
Signed-off-by: Alexey Kravets <akravets@nvidia.com>
Reviewed-on: http://git-master/r/819795
(cherry picked from commit 7bf627d61f4b724c22d02aef609fd60507a2b593)
Reviewed-on: http://git-master/r/824265
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Bharat Nihalani <bnihalani@nvidia.com>
Reviewed-on: http://git-master/r/1175197
Tested-by: Anatoly Nikiforov <anikiforov@nvidia.com>

16 files changed:
drivers/misc/tegra-profiler/Kconfig
drivers/misc/tegra-profiler/armv7_pmu.c
drivers/misc/tegra-profiler/armv8_pmu.c
drivers/misc/tegra-profiler/comm.c
drivers/misc/tegra-profiler/comm.h
drivers/misc/tegra-profiler/debug.h
drivers/misc/tegra-profiler/disassembler.c
drivers/misc/tegra-profiler/eh_unwind.c
drivers/misc/tegra-profiler/hrt.c
drivers/misc/tegra-profiler/main.c
drivers/misc/tegra-profiler/pl310.c
drivers/misc/tegra-profiler/quadd.h
drivers/misc/tegra-profiler/quadd_proc.c
drivers/misc/tegra-profiler/tegra.h
drivers/misc/tegra-profiler/version.h
include/linux/tegra_profiler.h

index d07cb3c..c320906 100644 (file)
@@ -1,5 +1,8 @@
 config TEGRA_PROFILER
        bool "Enable Tegra profiler"
-       depends on ARCH_TEGRA
+       depends on ARCH_TEGRA || PLATFORM_TEGRA
        help
          This option enables Tegra profiler
+         The main responsibility of this module is sampling.
+         This module creates the /dev/quadd device, which is essential
+         for the Tegra profiler to work.
index 1962a3e..d494ab0 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * drivers/misc/tegra-profiler/armv7_pmu.c
  *
- * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2014-2015, NVIDIA CORPORATION.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -20,7 +20,8 @@
 #include <linux/bitmap.h>
 #include <linux/slab.h>
 #include <asm/cputype.h>
-#include <asm/pmu.h>
+#include <linux/pmu.h>
+#include <linux/cpu.h>
 
 #include <linux/tegra_profiler.h>
 
@@ -30,7 +31,7 @@
 #include "quadd.h"
 #include "debug.h"
 
-static struct quadd_pmu_ctx pmu_ctx;
+static DEFINE_PER_CPU(struct quadd_pmu_ctx, pmu_ctx);
 
 enum {
        QUADD_ARM_CPU_TYPE_UNKNOWN,
@@ -237,11 +238,13 @@ static void select_counter(unsigned int counter)
 
 static int is_pmu_enabled(void)
 {
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
        u32 pmnc = armv7_pmu_pmnc_read();
 
        if (pmnc & QUADD_ARMV7_PMNC_E) {
                u32 cnten = armv7_pmu_cntens_read();
-               cnten &= pmu_ctx.counters_mask | QUADD_ARMV7_CCNT;
+
+               cnten &= local_pmu_ctx->counters_mask | QUADD_ARMV7_CCNT;
                return cnten ? 1 : 0;
        }
 
@@ -278,8 +281,10 @@ get_free_counters(unsigned long *bitmap, int nbits, int *ccntr)
        int cc;
        u32 cntens;
 
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
+
        cntens = armv7_pmu_cntens_read();
-       cntens = ~cntens & (pmu_ctx.counters_mask | QUADD_ARMV7_CCNT);
+       cntens = ~cntens & (local_pmu_ctx->counters_mask | QUADD_ARMV7_CCNT);
 
        bitmap_zero(bitmap, nbits);
        bitmap_copy(bitmap, (unsigned long *)&cntens,
@@ -295,14 +300,15 @@ get_free_counters(unsigned long *bitmap, int nbits, int *ccntr)
 
 static u32 armv7_pmu_adjust_value(u32 value, int event_id)
 {
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
        /*
        * Cortex A8/A9: l1 cache performance counters
        * don't differentiate between read and write data accesses/misses,
        * so the value is currently divided by two
        */
-       if (pmu_ctx.l1_cache_rw &&
-           (pmu_ctx.arch.type == QUADD_ARM_CPU_TYPE_CORTEX_A8 ||
-           pmu_ctx.arch.type == QUADD_ARM_CPU_TYPE_CORTEX_A9) &&
+       if (local_pmu_ctx->l1_cache_rw &&
+           (local_pmu_ctx->arch.type == QUADD_ARM_CPU_TYPE_CORTEX_A8 ||
+           local_pmu_ctx->arch.type == QUADD_ARM_CPU_TYPE_CORTEX_A9) &&
            (event_id == QUADD_EVENT_TYPE_L1_DCACHE_READ_MISSES ||
            event_id == QUADD_EVENT_TYPE_L1_DCACHE_WRITE_MISSES)) {
                return value / 2;
@@ -319,14 +325,18 @@ disable_interrupt(int idx)
 static void
 disable_all_interrupts(void)
 {
-       u32 val = QUADD_ARMV7_CCNT | pmu_ctx.counters_mask;
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
+       u32 val = QUADD_ARMV7_CCNT | local_pmu_ctx->counters_mask;
+
        armv7_pmu_intenc_write(val);
 }
 
 static void
 armv7_pmnc_reset_overflow_flags(void)
 {
-       u32 val = QUADD_ARMV7_CCNT | pmu_ctx.counters_mask;
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
+       u32 val = QUADD_ARMV7_CCNT | local_pmu_ctx->counters_mask;
+
        asm volatile("mcr p15, 0, %0, c9, c12, 3" : : "r" (val));
 }
 
@@ -340,13 +350,14 @@ select_event(unsigned int idx, unsigned int event)
 static void disable_all_counters(void)
 {
        u32 val;
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
 
        /* Disable all counters */
        val = armv7_pmu_pmnc_read();
        if (val & QUADD_ARMV7_PMNC_E)
                armv7_pmu_pmnc_write(val & ~QUADD_ARMV7_PMNC_E);
 
-       armv7_pmu_cntenc_write(QUADD_ARMV7_CCNT | pmu_ctx.counters_mask);
+       armv7_pmu_cntenc_write(QUADD_ARMV7_CCNT | local_pmu_ctx->counters_mask);
 }
 
 static void enable_all_counters(void)
@@ -415,6 +426,7 @@ static void pmu_start(void)
        u32 event;
        DECLARE_BITMAP(free_bitmap, QUADD_MAX_PMU_COUNTERS);
        struct quadd_pmu_info *pi = &__get_cpu_var(cpu_pmu_info);
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
        u32 *prevp = pi->prev_vals;
        struct quadd_pmu_event_info *ei;
 
@@ -431,7 +443,7 @@ static void pmu_start(void)
 
        pcntrs = get_free_counters(free_bitmap, QUADD_MAX_PMU_COUNTERS, &ccntr);
 
-       list_for_each_entry(ei, &pmu_ctx.used_events, list) {
+       list_for_each_entry(ei, &local_pmu_ctx->used_events, list) {
                int index;
 
                *prevp++ = 0;
@@ -497,6 +509,7 @@ pmu_read(struct event_data *events, int max_events)
        u32 val;
        int idx = 0, i = 0;
        struct quadd_pmu_info *pi = &__get_cpu_var(cpu_pmu_info);
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
        u32 *prevp = pi->prev_vals;
        struct quadd_pmu_event_info *ei;
 
@@ -505,7 +518,7 @@ pmu_read(struct event_data *events, int max_events)
                return 0;
        }
 
-       list_for_each_entry(ei, &pmu_ctx.used_events, list) {
+       list_for_each_entry(ei, &local_pmu_ctx->used_events, list) {
                int index;
 
                if (ei->quadd_event_id == QUADD_EVENT_TYPE_CPU_CYCLES) {
@@ -555,10 +568,11 @@ pmu_read_emulate(struct event_data *events, int max_events)
        int i = 0;
        static u32 val = 100;
        struct quadd_pmu_info *pi = &__get_cpu_var(cpu_pmu_info);
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
        u32 *prevp = pi->prev_vals;
        struct quadd_pmu_event_info *ei;
 
-       list_for_each_entry(ei, &pmu_ctx.used_events, list) {
+       list_for_each_entry(ei, &local_pmu_ctx->used_events, list) {
                if (val > 200)
                        val = 100;
 
@@ -609,20 +623,22 @@ static void free_events(struct list_head *head)
        }
 }
 
-static int set_events(int *events, int size)
+static int set_events(int cpuid, int *events, int size)
 {
        int free_pcntrs, err;
        int i, nr_l1_r = 0, nr_l1_w = 0;
        struct quadd_cntrs_info free_ci;
 
-       pmu_ctx.l1_cache_rw = 0;
+       struct quadd_pmu_ctx *local_pmu_ctx = &per_cpu(pmu_ctx, cpuid);
+
+       local_pmu_ctx->l1_cache_rw = 0;
 
-       free_events(&pmu_ctx.used_events);
+       free_events(&local_pmu_ctx->used_events);
 
        if (!events || !size)
                return 0;
 
-       if (!pmu_ctx.current_map) {
+       if (!local_pmu_ctx->current_map) {
                pr_err("Invalid current_map\n");
                return -ENODEV;
        }
@@ -653,7 +669,7 @@ static int set_events(int *events, int size)
                }
 
                INIT_LIST_HEAD(&ei->list);
-               list_add_tail(&ei->list, &pmu_ctx.used_events);
+               list_add_tail(&ei->list, &local_pmu_ctx->used_events);
 
                if (events[i] == QUADD_EVENT_TYPE_CPU_CYCLES) {
                        ei->hw_value = QUADD_ARMV7_CPU_CYCLE_EVENT;
@@ -669,7 +685,7 @@ static int set_events(int *events, int size)
                                goto out_free;
                        }
 
-                       ei->hw_value = pmu_ctx.current_map[events[i]];
+                       ei->hw_value = local_pmu_ctx->current_map[events[i]];
                }
 
                ei->quadd_event_id = events[i];
@@ -685,34 +701,38 @@ static int set_events(int *events, int size)
        }
 
        if (nr_l1_r > 0 && nr_l1_w > 0)
-               pmu_ctx.l1_cache_rw = 1;
+               local_pmu_ctx->l1_cache_rw = 1;
 
        return 0;
 
 out_free:
-       free_events(&pmu_ctx.used_events);
+       free_events(&local_pmu_ctx->used_events);
        return err;
 }
 
-static int get_supported_events(int *events, int max_events)
+static int get_supported_events(int cpuid, int *events, int max_events)
 {
        int i, nr_events = 0;
+       struct quadd_pmu_ctx *local_pmu_ctx = &per_cpu(pmu_ctx, cpuid);
 
        max_events = min_t(int, QUADD_EVENT_TYPE_MAX, max_events);
 
        for (i = 0; i < max_events; i++) {
-               if (pmu_ctx.current_map[i] != QUADD_ARMV7_UNSUPPORTED_EVENT)
+               unsigned int event = local_pmu_ctx->current_map[i];
+
+               if (event != QUADD_ARMV7_UNSUPPORTED_EVENT)
                        events[nr_events++] = i;
        }
        return nr_events;
 }
 
-static int get_current_events(int *events, int max_events)
+static int get_current_events(int cpuid, int *events, int max_events)
 {
        int i = 0;
        struct quadd_pmu_event_info *ei;
+       struct quadd_pmu_ctx *local_pmu_ctx = &per_cpu(pmu_ctx, cpuid);
 
-       list_for_each_entry(ei, &pmu_ctx.used_events, list) {
+       list_for_each_entry(ei, &local_pmu_ctx->used_events, list) {
                events[i++] = ei->quadd_event_id;
 
                if (i >= max_events)
@@ -722,9 +742,11 @@ static int get_current_events(int *events, int max_events)
        return i;
 }
 
-static struct quadd_arch_info *get_arch(void)
+static struct quadd_arch_info *get_arch(int cpuid)
 {
-       return &pmu_ctx.arch;
+       struct quadd_pmu_ctx *local_pmu_ctx = &per_cpu(pmu_ctx, cpuid);
+
+       return &local_pmu_ctx->arch;
 }
 
 static struct quadd_event_source_interface pmu_armv7_int = {
@@ -745,61 +767,96 @@ static struct quadd_event_source_interface pmu_armv7_int = {
        .get_arch               = get_arch,
 };
 
-struct quadd_event_source_interface *quadd_armv7_pmu_init(void)
+struct quadd_event_source_interface *quadd_armv7_pmu_init_for_cpu(int cpuid)
 {
        struct quadd_event_source_interface *pmu = NULL;
        unsigned long cpu_id, cpu_implementer, part_number;
 
-       cpu_id = read_cpuid_id();
+       struct cpuinfo_arm *local_cpu_data = &per_cpu(cpu_data, cpuid);
+       struct quadd_pmu_ctx *local_pmu_ctx = &per_cpu(pmu_ctx, cpuid);
+
+       cpu_id = local_cpu_data->cpuid;
        cpu_implementer = cpu_id >> 24;
        part_number = cpu_id & 0xFFF0;
 
-       pmu_ctx.arch.type = QUADD_ARM_CPU_TYPE_UNKNOWN;
-       pmu_ctx.arch.ver = 0;
-       strncpy(pmu_ctx.arch.name, "Unknown",
-               sizeof(pmu_ctx.arch.name));
+       local_pmu_ctx->arch.type = QUADD_ARM_CPU_TYPE_UNKNOWN;
+       local_pmu_ctx->arch.ver = 0;
+       strncpy(local_pmu_ctx->arch.name, "Unknown",
+               sizeof(local_pmu_ctx->arch.name));
 
        if (cpu_implementer == ARM_CPU_IMP_ARM) {
                switch (part_number) {
                case ARM_CPU_PART_CORTEX_A9:
-                       pmu_ctx.arch.type = QUADD_ARM_CPU_TYPE_CORTEX_A9;
-                       strncpy(pmu_ctx.arch.name, "Cortex A9",
-                               sizeof(pmu_ctx.arch.name));
+                       local_pmu_ctx->arch.type = QUADD_ARM_CPU_TYPE_CORTEX_A9;
+                       strncpy(local_pmu_ctx->arch.name, "Cortex A9",
+                               sizeof(local_pmu_ctx->arch.name));
 
-                       pmu_ctx.counters_mask =
+                       local_pmu_ctx->counters_mask =
                                QUADD_ARMV7_COUNTERS_MASK_CORTEX_A9;
-                       pmu_ctx.current_map = quadd_armv7_a9_events_map;
+                       local_pmu_ctx->current_map = quadd_armv7_a9_events_map;
                        pmu = &pmu_armv7_int;
                        break;
 
                case ARM_CPU_PART_CORTEX_A15:
-                       pmu_ctx.arch.type = QUADD_ARM_CPU_TYPE_CORTEX_A15;
-                       strncpy(pmu_ctx.arch.name, "Cortex A15",
-                               sizeof(pmu_ctx.arch.name));
+                       local_pmu_ctx->arch.type =
+                               QUADD_ARM_CPU_TYPE_CORTEX_A15;
 
-                       pmu_ctx.counters_mask =
+                       strncpy(local_pmu_ctx->arch.name, "Cortex A15",
+                               sizeof(local_pmu_ctx->arch.name));
+
+                       local_pmu_ctx->counters_mask =
                                QUADD_ARMV7_COUNTERS_MASK_CORTEX_A15;
-                       pmu_ctx.current_map = quadd_armv7_a15_events_map;
+                       local_pmu_ctx->current_map = quadd_armv7_a15_events_map;
                        pmu = &pmu_armv7_int;
                        break;
 
                default:
-                       pmu_ctx.arch.type = QUADD_ARM_CPU_TYPE_UNKNOWN;
-                       pmu_ctx.current_map = NULL;
+                       local_pmu_ctx->arch.type = QUADD_ARM_CPU_TYPE_UNKNOWN;
+                       local_pmu_ctx->current_map = NULL;
                        break;
                }
        }
 
-       INIT_LIST_HEAD(&pmu_ctx.used_events);
+       INIT_LIST_HEAD(&local_pmu_ctx->used_events);
 
-       pmu_ctx.arch.name[sizeof(pmu_ctx.arch.name) - 1] = '\0';
+       local_pmu_ctx->arch.name[sizeof(local_pmu_ctx->arch.name) - 1] = '\0';
        pr_info("arch: %s, type: %d, ver: %d\n",
-               pmu_ctx.arch.name, pmu_ctx.arch.type, pmu_ctx.arch.ver);
+               local_pmu_ctx->arch.name, local_pmu_ctx->arch.type,
+               local_pmu_ctx->arch.ver);
+
+       return pmu;
+}
+
+struct quadd_event_source_interface *quadd_armv7_pmu_init(void)
+{
+       struct quadd_event_source_interface *pmu = NULL;
+       int cpuid;
+       int initialized = 1;
+
+       for_each_possible_cpu(cpuid) {
+               /* quadd_armv7_pmu_init_for_cpu() returns NULL on failure */
+               if (!quadd_armv7_pmu_init_for_cpu(cpuid)) {
+                       initialized = 0;
+                       break;
+               }
+       }
+
+       if (initialized == 1)
+               pmu = &pmu_armv7_int;
+       else
+               pr_err("error: incorrect PMUVer\n");
 
        return pmu;
 }
 
 void quadd_armv7_pmu_deinit(void)
 {
-       free_events(&pmu_ctx.used_events);
+       int cpuid;
+
+       for_each_possible_cpu(cpuid) {
+               struct quadd_pmu_ctx *local_pmu_ctx = &per_cpu(pmu_ctx, cpuid);
+
+               free_events(&local_pmu_ctx->used_events);
+       }
 }
index 2e618da..64e4971 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * drivers/misc/tegra-profiler/armv8_pmu.c
  *
- * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2014-2015, NVIDIA CORPORATION.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 
 #include <asm/cputype.h>
+#include <asm/cpu.h>
 
 #include "arm_pmu.h"
 #include "armv8_pmu.h"
@@ -48,7 +49,7 @@ struct quadd_cntrs_info {
 
 static DEFINE_PER_CPU(struct quadd_pmu_info, cpu_pmu_info);
 
-static struct quadd_pmu_ctx pmu_ctx;
+static DEFINE_PER_CPU(struct quadd_pmu_ctx, pmu_ctx);
 
 static unsigned
 quadd_armv8_pmuv3_arm_events_map[QUADD_EVENT_TYPE_MAX] = {
@@ -299,11 +300,13 @@ static void select_counter(unsigned int counter)
 
 static int is_pmu_enabled(void)
 {
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
        u32 pmcr = armv8_pmu_pmcr_read();
 
        if (pmcr & QUADD_ARMV8_PMCR_E) {
                u32 pmcnten = armv8_pmu_pmcntenset_read();
-               pmcnten &= pmu_ctx.counters_mask | QUADD_ARMV8_CCNT;
+
+               pmcnten &= local_pmu_ctx->counters_mask | QUADD_ARMV8_CCNT;
                return pmcnten ? 1 : 0;
        }
 
@@ -340,8 +343,10 @@ get_free_counters(unsigned long *bitmap, int nbits, int *ccntr)
        int cc;
        u32 cntens;
 
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
+
        cntens = armv8_pmu_pmcntenset_read();
-       cntens = ~cntens & (pmu_ctx.counters_mask | QUADD_ARMV8_CCNT);
+       cntens = ~cntens & (local_pmu_ctx->counters_mask | QUADD_ARMV8_CCNT);
 
        bitmap_zero(bitmap, nbits);
        bitmap_copy(bitmap, (unsigned long *)&cntens,
@@ -364,14 +369,18 @@ disable_interrupt(int idx)
 static void
 disable_all_interrupts(void)
 {
-       u32 val = QUADD_ARMV8_CCNT | pmu_ctx.counters_mask;
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
+       u32 val = QUADD_ARMV8_CCNT | local_pmu_ctx->counters_mask;
+
        armv8_pmu_pmintenclr_write(val);
 }
 
 static void
 reset_overflow_flags(void)
 {
-       u32 val = QUADD_ARMV8_CCNT | pmu_ctx.counters_mask;
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
+       u32 val = QUADD_ARMV8_CCNT | local_pmu_ctx->counters_mask;
+
        armv8_pmu_pmovsclr_write(val);
 }
 
@@ -385,13 +394,16 @@ select_event(unsigned int idx, unsigned int event)
 static void disable_all_counters(void)
 {
        u32 val;
+       u32 masked;
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
 
        /* Disable all counters */
        val = armv8_pmu_pmcr_read();
        if (val & QUADD_ARMV8_PMCR_E)
                armv8_pmu_pmcr_write(val & ~QUADD_ARMV8_PMCR_E);
 
-       armv8_pmu_pmcntenclr_write(QUADD_ARMV8_CCNT | pmu_ctx.counters_mask);
+       masked = QUADD_ARMV8_CCNT | local_pmu_ctx->counters_mask;
+       armv8_pmu_pmcntenclr_write(masked);
 }
 
 static void enable_all_counters(void)
@@ -458,6 +470,7 @@ static void pmu_start(void)
 {
        int idx = 0, pcntrs, ccntr;
        u32 event;
+       struct quadd_pmu_ctx *local_pmu_ctx;
        DECLARE_BITMAP(free_bitmap, QUADD_MAX_PMU_COUNTERS);
        struct quadd_pmu_info *pi = &__get_cpu_var(cpu_pmu_info);
        u32 *prevp = pi->prev_vals;
@@ -476,7 +489,8 @@ static void pmu_start(void)
 
        pcntrs = get_free_counters(free_bitmap, QUADD_MAX_PMU_COUNTERS, &ccntr);
 
-       list_for_each_entry(ei, &pmu_ctx.used_events, list) {
+       local_pmu_ctx = &__get_cpu_var(pmu_ctx);
+       list_for_each_entry(ei, &local_pmu_ctx->used_events, list) {
                int index;
 
                *prevp++ = 0;
@@ -542,6 +556,7 @@ pmu_read(struct event_data *events, int max_events)
        u32 val;
        int idx = 0, i = 0;
        struct quadd_pmu_info *pi = &__get_cpu_var(cpu_pmu_info);
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
        u32 *prevp = pi->prev_vals;
        struct quadd_pmu_event_info *ei;
 
@@ -550,7 +565,7 @@ pmu_read(struct event_data *events, int max_events)
                return 0;
        }
 
-       list_for_each_entry(ei, &pmu_ctx.used_events, list) {
+       list_for_each_entry(ei, &local_pmu_ctx->used_events, list) {
                int index;
 
                if (ei->quadd_event_id == QUADD_EVENT_TYPE_CPU_CYCLES) {
@@ -602,7 +617,9 @@ pmu_read_emulate(struct event_data *events, int max_events)
        u32 *prevp = pi->prev_vals;
        struct quadd_pmu_event_info *ei;
 
-       list_for_each_entry(ei, &pmu_ctx.used_events, list) {
+       struct quadd_pmu_ctx *local_pmu_ctx = &__get_cpu_var(pmu_ctx);
+
+       list_for_each_entry(ei, &local_pmu_ctx->used_events, list) {
                if (val > 200)
                        val = 100;
 
@@ -653,20 +670,23 @@ static void free_events(struct list_head *head)
        }
 }
 
-static int set_events(int *events, int size)
+static int set_events(int cpuid, int *events, int size)
 {
        int free_pcntrs, err;
        int i, nr_l1_r = 0, nr_l1_w = 0;
        struct quadd_cntrs_info free_ci;
 
-       pmu_ctx.l1_cache_rw = 0;
+       struct quadd_pmu_ctx *local_pmu_ctx = &per_cpu(pmu_ctx, cpuid);
 
-       free_events(&pmu_ctx.used_events);
+       local_pmu_ctx->l1_cache_rw = 0;
+
+       free_events(&local_pmu_ctx->used_events);
 
        if (!events || !size)
                return 0;
 
-       if (!pmu_ctx.current_map) {
+       if (!local_pmu_ctx->current_map) {
                pr_err("Invalid current_map\n");
                return -ENODEV;
        }
@@ -675,7 +695,7 @@ static int set_events(int *events, int size)
        free_ci.pcntrs = QUADD_MAX_PMU_COUNTERS;
        free_ci.ccntr = 1;
 
-       on_each_cpu(__get_free_counters, &free_ci, 1);
+       smp_call_function_single(cpuid, __get_free_counters, &free_ci, 1);
 
        free_pcntrs = free_ci.pcntrs;
        pr_info("free counters: pcntrs/ccntr: %d/%d\n",
@@ -700,7 +720,7 @@ static int set_events(int *events, int size)
                }
 
                INIT_LIST_HEAD(&ei->list);
-               list_add_tail(&ei->list, &pmu_ctx.used_events);
+               list_add_tail(&ei->list, &local_pmu_ctx->used_events);
 
                if (events[i] == QUADD_EVENT_TYPE_CPU_CYCLES) {
                        ei->hw_value = QUADD_ARMV8_CPU_CYCLE_EVENT;
@@ -716,7 +736,7 @@ static int set_events(int *events, int size)
                                goto out_free;
                        }
 
-                       ei->hw_value = pmu_ctx.current_map[events[i]];
+                       ei->hw_value = local_pmu_ctx->current_map[events[i]];
                }
 
                ei->quadd_event_id = events[i];
@@ -732,34 +752,41 @@ static int set_events(int *events, int size)
        }
 
        if (nr_l1_r > 0 && nr_l1_w > 0)
-               pmu_ctx.l1_cache_rw = 1;
+               local_pmu_ctx->l1_cache_rw = 1;
 
        return 0;
 
 out_free:
-       free_events(&pmu_ctx.used_events);
+       free_events(&local_pmu_ctx->used_events);
        return err;
 }
 
-static int get_supported_events(int *events, int max_events)
+static int get_supported_events(int cpuid, int *events, int max_events)
 {
        int i, nr_events = 0;
 
+       struct quadd_pmu_ctx *local_pmu_ctx = &per_cpu(pmu_ctx, cpuid);
+
        max_events = min_t(int, QUADD_EVENT_TYPE_MAX, max_events);
 
        for (i = 0; i < max_events; i++) {
-               if (pmu_ctx.current_map[i] != QUADD_ARMV8_UNSUPPORTED_EVENT)
+               unsigned int event = local_pmu_ctx->current_map[i];
+
+               if (event != QUADD_ARMV8_UNSUPPORTED_EVENT)
                        events[nr_events++] = i;
        }
        return nr_events;
 }
 
-static int get_current_events(int *events, int max_events)
+static int get_current_events(int cpuid, int *events, int max_events)
 {
        int i = 0;
        struct quadd_pmu_event_info *ei;
 
-       list_for_each_entry(ei, &pmu_ctx.used_events, list) {
+       struct quadd_pmu_ctx *local_pmu_ctx = &per_cpu(pmu_ctx, cpuid);
+
+       list_for_each_entry(ei, &local_pmu_ctx->used_events, list) {
                events[i++] = ei->quadd_event_id;
 
                if (i >= max_events)
@@ -769,9 +796,11 @@ static int get_current_events(int *events, int max_events)
        return i;
 }
 
-static struct quadd_arch_info *get_arch(void)
+static struct quadd_arch_info *get_arch(int cpuid)
 {
-       return &pmu_ctx.arch;
+       struct quadd_pmu_ctx *local_pmu_ctx = &per_cpu(pmu_ctx, cpuid);
+
+       return &local_pmu_ctx->arch;
 }
 
 static struct quadd_event_source_interface pmu_armv8_int = {
@@ -792,101 +821,150 @@ static struct quadd_event_source_interface pmu_armv8_int = {
        .get_arch               = get_arch,
 };
 
-struct quadd_event_source_interface *quadd_armv8_pmu_init(void)
+static int quadd_armv8_pmu_init_for_cpu(int cpuid)
 {
-       u32 pmcr, imp, idcode;
-       struct quadd_event_source_interface *pmu = NULL;
+       u32 pmcr;
+       u32 idcode = 0;
+       int err = 0;
+       int idx;
+       struct cpuinfo_arm64 *local_cpu_data = &per_cpu(cpu_data, cpuid);
+       struct quadd_pmu_ctx *local_pmu_ctx = &per_cpu(pmu_ctx, cpuid);
+       u32 reg_midr = local_cpu_data->reg_midr;
+       u32 ext_ver;
+       u64 aa64_dfr;
+
+       char implementer = (reg_midr >> 24) & 0xFF;
+
+       strncpy(local_pmu_ctx->arch.name, "Unknown",
+                       sizeof(local_pmu_ctx->arch.name));
+
+       local_pmu_ctx->arch.type = QUADD_AA64_CPU_TYPE_UNKNOWN;
+       local_pmu_ctx->arch.ver = 0;
+       local_pmu_ctx->current_map = NULL;
 
-       u64 aa64_dfr = read_cpuid(ID_AA64DFR0_EL1);
+       aa64_dfr = read_cpuid(ID_AA64DFR0_EL1);
        aa64_dfr = (aa64_dfr >> 8) & 0x0f;
 
-       strncpy(pmu_ctx.arch.name, "Unknown", sizeof(pmu_ctx.arch.name));
-       pmu_ctx.arch.type = QUADD_AA64_CPU_TYPE_UNKNOWN;
-       pmu_ctx.arch.ver = 0;
-       pmu_ctx.current_map = NULL;
+       if (aa64_dfr != QUADD_AA64_PMUVER_PMUV3)
+               err = 1;
 
-       switch (aa64_dfr) {
-       case QUADD_AA64_PMUVER_PMUV3:
-               strncpy(pmu_ctx.arch.name, "AA64 PmuV3",
-                       sizeof(pmu_ctx.arch.name));
-               pmu_ctx.arch.name[sizeof(pmu_ctx.arch.name) - 1] = '\0';
+       if (err == 0 && (implementer == 'A' || implementer == 'N')) {
 
-               pmu_ctx.counters_mask =
+               strncpy(local_pmu_ctx->arch.name, "AA64 PmuV3",
+                               sizeof(local_pmu_ctx->arch.name));
+
+               idx = sizeof(local_pmu_ctx->arch.name) - 1;
+               local_pmu_ctx->arch.name[idx] = '\0';
+
+               local_pmu_ctx->counters_mask =
                        QUADD_ARMV8_COUNTERS_MASK_PMUV3;
-               pmu_ctx.current_map =
+               local_pmu_ctx->current_map =
                        quadd_armv8_pmuv3_arm_events_map;
 
                pmcr = armv8_pmu_pmcr_read();
 
                idcode = (pmcr >> QUADD_ARMV8_PMCR_IDCODE_SHIFT) &
                        QUADD_ARMV8_PMCR_IDCODE_MASK;
-               imp = pmcr >> QUADD_ARMV8_PMCR_IMP_SHIFT;
 
-               pr_info("imp: %#x, idcode: %#x\n", imp, idcode);
+               pr_info("imp: %#x, idcode: %#x\n", implementer, idcode);
+       }
 
-               if (imp == ARM_CPU_IMP_ARM) {
-                       strncat(pmu_ctx.arch.name, " ARM",
-                               sizeof(pmu_ctx.arch.name) -
-                               strlen(pmu_ctx.arch.name));
-                       pmu_ctx.arch.name[sizeof(pmu_ctx.arch.name) - 1] = '\0';
+       if (err == 0) {
+               switch (implementer) {
+               case 'A':
+                       strncat(local_pmu_ctx->arch.name, " ARM",
+                               sizeof(local_pmu_ctx->arch.name) -
+                               strlen(local_pmu_ctx->arch.name));
+                       idx = sizeof(local_pmu_ctx->arch.name) - 1;
+                       local_pmu_ctx->arch.name[idx] = '\0';
 
                        if (idcode == QUADD_AA64_CPU_IDCODE_CORTEX_A53) {
-                               pmu_ctx.arch.type =
+                               local_pmu_ctx->arch.type =
                                        QUADD_AA64_CPU_TYPE_CORTEX_A53;
 
-                               strncat(pmu_ctx.arch.name, " CORTEX-A53",
-                                       sizeof(pmu_ctx.arch.name) -
-                                       strlen(pmu_ctx.arch.name));
+                               strncat(local_pmu_ctx->arch.name, " CORTEX-A53",
+                                       sizeof(local_pmu_ctx->arch.name) -
+                                       strlen(local_pmu_ctx->arch.name));
+
                        } else if (idcode == QUADD_AA64_CPU_IDCODE_CORTEX_A57) {
-                               pmu_ctx.arch.type =
+                               local_pmu_ctx->arch.type =
                                        QUADD_AA64_CPU_TYPE_CORTEX_A57;
-                               pmu_ctx.current_map =
+                               local_pmu_ctx->current_map =
                                        quadd_armv8_pmuv3_a57_events_map;
 
-                               strncat(pmu_ctx.arch.name, " CORTEX-A57",
-                                       sizeof(pmu_ctx.arch.name) -
-                                       strlen(pmu_ctx.arch.name));
+                               strncat(local_pmu_ctx->arch.name, " CORTEX-A57",
+                                       sizeof(local_pmu_ctx->arch.name) -
+                                       strlen(local_pmu_ctx->arch.name));
                        } else {
-                               pmu_ctx.arch.type = QUADD_AA64_CPU_TYPE_ARM;
+                               local_pmu_ctx->arch.type =
+                                       QUADD_AA64_CPU_TYPE_ARM;
                        }
-               } else if (imp == QUADD_AA64_CPU_IMP_NVIDIA) {
-                       u32 ext_ver = armv8_id_afr0_el1_read();
-                       ext_ver = (ext_ver >> QUADD_ARMV8_PMU_NVEXT_SHIFT) &
-                                 QUADD_ARMV8_PMU_NVEXT_MASK;
-
-                       strncat(pmu_ctx.arch.name, " NVIDIA (Denver)",
-                               sizeof(pmu_ctx.arch.name) -
-                               strlen(pmu_ctx.arch.name));
+                       break;
+               case 'N':
+                       ext_ver = armv8_id_afr0_el1_read();
 
-                       pmu_ctx.arch.type = QUADD_AA64_CPU_TYPE_DENVER;
-                       pmu_ctx.arch.ver = ext_ver;
-                       pmu_ctx.current_map =
+                       ext_ver = (ext_ver >> QUADD_ARMV8_PMU_NVEXT_SHIFT) &
+                               QUADD_ARMV8_PMU_NVEXT_MASK;
+
+                       strncat(local_pmu_ctx->arch.name, " NVIDIA (Denver)",
+                               sizeof(local_pmu_ctx->arch.name) -
+                               strlen(local_pmu_ctx->arch.name));
+                       local_pmu_ctx->arch.type = QUADD_AA64_CPU_TYPE_DENVER;
+                       local_pmu_ctx->arch.ver = ext_ver;
+                       local_pmu_ctx->current_map =
                                quadd_armv8_pmuv3_denver_events_map;
-               } else {
-                       strncat(pmu_ctx.arch.name, " Unknown implementor code",
-                               sizeof(pmu_ctx.arch.name) -
-                               strlen(pmu_ctx.arch.name));
-                       pmu_ctx.arch.type = QUADD_AA64_CPU_TYPE_UNKNOWN_IMP;
+                       break;
+               default:
+                       strncat(local_pmu_ctx->arch.name,
+                               " Unknown implementor code",
+                               sizeof(local_pmu_ctx->arch.name) -
+                               strlen(local_pmu_ctx->arch.name));
+                       local_pmu_ctx->arch.type =
+                               QUADD_AA64_CPU_TYPE_UNKNOWN_IMP;
+                       err = 1;
+                       break;
                }
+       }
 
-               pmu = &pmu_armv8_int;
-               break;
+       local_pmu_ctx->arch.name[sizeof(local_pmu_ctx->arch.name) - 1] = '\0';
+       pr_info("arch: %s, type: %d, ver: %d\n",
+               local_pmu_ctx->arch.name, local_pmu_ctx->arch.type,
+               local_pmu_ctx->arch.ver);
 
-       default:
-               pr_err("error: incorrect PMUVer\n");
-               break;
-       }
+       INIT_LIST_HEAD(&local_pmu_ctx->used_events);
+       return err;
+}
 
-       INIT_LIST_HEAD(&pmu_ctx.used_events);
+struct quadd_event_source_interface *quadd_armv8_pmu_init(void)
+{
+       struct quadd_event_source_interface *pmu = NULL;
+       int cpuid;
+       int err;
+       int initialized = 1;
+
+       for_each_possible_cpu(cpuid) {
+               err = quadd_armv8_pmu_init_for_cpu(cpuid);
+               if (err) {
+                       initialized = 0;
+                       break;
+               }
+       }
 
-       pmu_ctx.arch.name[sizeof(pmu_ctx.arch.name) - 1] = '\0';
-       pr_info("arch: %s, type: %d, ver: %d\n",
-               pmu_ctx.arch.name, pmu_ctx.arch.type, pmu_ctx.arch.ver);
+       if (initialized == 1)
+               pmu = &pmu_armv8_int;
+       else
+               pr_err("error: incorrect PMUVer\n");
 
        return pmu;
 }
 
 void quadd_armv8_pmu_deinit(void)
 {
-       free_events(&pmu_ctx.used_events);
+       int cpu_id;
+
+       for_each_possible_cpu(cpu_id) {
+               struct quadd_pmu_ctx *local_pmu_ctx = &per_cpu(pmu_ctx, cpu_id);
+
+               free_events(&local_pmu_ctx->used_events);
+       }
 }
index 80c8461..e8c2c3a 100644 (file)
@@ -26,7 +26,7 @@
 #include <linux/err.h>
 #include <linux/mm.h>
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include <linux/tegra_profiler.h>
 
@@ -65,6 +65,7 @@ struct quadd_comm_ctx {
 
 struct comm_cpu_context {
        struct quadd_ring_buffer rb;
+       int params_ok;
 };
 
 static struct quadd_comm_ctx comm_ctx;
@@ -74,6 +75,7 @@ static int __maybe_unused
 rb_is_full(struct quadd_ring_buffer *rb)
 {
        struct quadd_ring_buffer_hdr *rb_hdr = rb->rb_hdr;
+
        return (rb_hdr->pos_write + 1) % rb_hdr->size == rb_hdr->pos_read;
 }
 
@@ -81,6 +83,7 @@ static int __maybe_unused
 rb_is_empty(struct quadd_ring_buffer *rb)
 {
        struct quadd_ring_buffer_hdr *rb_hdr = rb->rb_hdr;
+
        return rb_hdr->pos_read == rb_hdr->pos_write;
 }
 
@@ -256,6 +259,7 @@ find_mmap(unsigned long vm_start)
 
        list_for_each_entry(entry, &comm_ctx.mmap_areas, list) {
                struct vm_area_struct *mmap_vma = entry->mmap_vma;
+
                if (vm_start == mmap_vma->vm_start)
                        return entry;
        }
@@ -408,6 +412,37 @@ static void rb_reset(struct quadd_ring_buffer *rb)
        spin_unlock_irqrestore(&rb->lock, flags);
 }
 
+static int
+ready_to_profile(void)
+{
+       int cpuid;
+
+       if (!comm_ctx.params_ok)
+               return 0;
+
+       for_each_possible_cpu(cpuid) {
+               struct comm_cpu_context *cc = &per_cpu(cpu_ctx, cpuid);
+
+               if (!cc->params_ok)
+                       return 0;
+       }
+
+       return 1;
+}
+
+static void
+reset_params_ok_flag(void)
+{
+       int cpu_id;
+
+       comm_ctx.params_ok = 0;
+       for_each_possible_cpu(cpu_id) {
+               struct comm_cpu_context *cc = &per_cpu(cpu_ctx, cpu_id);
+
+               cc->params_ok = 0;
+       }
+}
+
 static long
 device_ioctl(struct file *file,
             unsigned int ioctl_num,
@@ -416,6 +451,8 @@ device_ioctl(struct file *file,
        int err = 0;
        struct quadd_mmap_area *mmap;
        struct quadd_parameters *user_params;
+       struct quadd_pmu_setup_for_cpu *cpu_pmu_params;
+       struct quadd_comm_cap_for_cpu *per_cpu_cap;
        struct quadd_comm_cap cap;
        struct quadd_module_state state;
        struct quadd_module_version versions;
@@ -427,14 +464,56 @@ device_ioctl(struct file *file,
        if (ioctl_num != IOCTL_SETUP &&
            ioctl_num != IOCTL_GET_CAP &&
            ioctl_num != IOCTL_GET_STATE &&
+           ioctl_num != IOCTL_SETUP_PMU_FOR_CPU &&
+           ioctl_num != IOCTL_GET_CAP_FOR_CPU &&
            ioctl_num != IOCTL_GET_VERSION) {
-               if (!comm_ctx.params_ok) {
+               if (!ready_to_profile()) {
                        err = -EACCES;
                        goto error_out;
                }
        }
 
        switch (ioctl_num) {
+       case IOCTL_SETUP_PMU_FOR_CPU:
+               if (atomic_read(&comm_ctx.active)) {
+                       pr_err("error: tegra profiler is active\n");
+                       err = -EBUSY;
+                       goto error_out;
+               }
+
+               cpu_pmu_params = vmalloc(sizeof(*cpu_pmu_params));
+               if (!cpu_pmu_params) {
+                       err = -ENOMEM;
+                       goto error_out;
+               }
+
+               if (copy_from_user(cpu_pmu_params,
+                                  (void __user *)ioctl_param,
+                                  sizeof(*cpu_pmu_params))) {
+                       pr_err("setup failed\n");
+                       vfree(cpu_pmu_params);
+                       err = -EFAULT;
+                       goto error_out;
+               }
+
+               per_cpu(cpu_ctx, cpu_pmu_params->cpuid).params_ok = 0;
+
+               err = comm_ctx.control->set_parameters_for_cpu(cpu_pmu_params);
+               if (err) {
+                       pr_err("error: setup failed\n");
+                       vfree(cpu_pmu_params);
+                       goto error_out;
+               }
+
+               per_cpu(cpu_ctx, cpu_pmu_params->cpuid).params_ok = 1;
+
+               pr_info("setup PMU success for cpu: %d\n",
+                       cpu_pmu_params->cpuid);
+
+               vfree(cpu_pmu_params);
+               break;
+
        case IOCTL_SETUP:
                if (atomic_read(&comm_ctx.active)) {
                        pr_err("error: tegra profiler is active\n");
@@ -492,6 +571,34 @@ device_ioctl(struct file *file,
                }
                break;
 
+       case IOCTL_GET_CAP_FOR_CPU:
+               per_cpu_cap = vmalloc(sizeof(*per_cpu_cap));
+               if (!per_cpu_cap) {
+                       err = -ENOMEM;
+                       goto error_out;
+               }
+
+               if (copy_from_user(per_cpu_cap, (void __user *)ioctl_param,
+                                  sizeof(*per_cpu_cap))) {
+                       pr_err("setup failed\n");
+                       vfree(per_cpu_cap);
+                       err = -EFAULT;
+                       goto error_out;
+               }
+               comm_ctx.control->get_capabilities_for_cpu(per_cpu_cap->cpuid,
+                                                          per_cpu_cap);
+
+               if (copy_to_user((void __user *)ioctl_param, per_cpu_cap,
+                                sizeof(*per_cpu_cap))) {
+                       pr_err("error: get_capabilities failed\n");
+                       vfree(per_cpu_cap);
+                       err = -EFAULT;
+                       goto error_out;
+               }
+
+               vfree(per_cpu_cap);
+               break;
+
        case IOCTL_GET_VERSION:
                strcpy((char *)versions.branch, QUADD_MODULE_BRANCH);
                strcpy((char *)versions.version, QUADD_MODULE_VERSION);
@@ -536,6 +643,7 @@ device_ioctl(struct file *file,
 
        case IOCTL_STOP:
                if (atomic_cmpxchg(&comm_ctx.active, 1, 0)) {
+                       reset_params_ok_flag();
                        comm_ctx.control->stop();
                        wake_up_all(&comm_ctx.read_wait);
                        rb_stop();
@@ -770,10 +878,8 @@ static int comm_init(void)
        struct miscdevice *misc_dev;
 
        misc_dev = kzalloc(sizeof(*misc_dev), GFP_KERNEL);
-       if (!misc_dev) {
-               pr_err("Error: alloc error\n");
+       if (!misc_dev)
                return -ENOMEM;
-       }
 
        misc_dev->minor = MISC_DYNAMIC_MINOR;
        misc_dev->name = QUADD_DEVICE_NAME;
@@ -790,7 +896,6 @@ static int comm_init(void)
        mutex_init(&comm_ctx.io_mutex);
        atomic_set(&comm_ctx.active, 0);
 
-       comm_ctx.params_ok = 0;
        comm_ctx.nr_users = 0;
 
        init_waitqueue_head(&comm_ctx.read_wait);
@@ -812,6 +917,8 @@ static int comm_init(void)
                spin_lock_init(&rb->lock);
        }
 
+       reset_params_ok_flag();
+
        return 0;
 }
 
index d04e04f..f2a6048 100644 (file)
@@ -55,7 +55,10 @@ struct quadd_comm_control_interface {
        int (*start)(void);
        void (*stop)(void);
        int (*set_parameters)(struct quadd_parameters *param);
+       int (*set_parameters_for_cpu)(struct quadd_pmu_setup_for_cpu *param);
        void (*get_capabilities)(struct quadd_comm_cap *cap);
+       void (*get_capabilities_for_cpu)(int cpuid,
+                                        struct quadd_comm_cap_for_cpu *cap);
        void (*get_state)(struct quadd_module_state *state);
        int (*set_extab)(struct quadd_sections *extabs,
                         struct quadd_mmap_area *mmap);
index ff62919..656438d 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * drivers/misc/tegra-profiler/debug.h
  *
- * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2013-2015, NVIDIA CORPORATION.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -62,10 +62,10 @@ static inline void qm_debug_stop_source(int source_type)
 void quadd_test_delay(void);
 
 #define QM_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
-static inline char *
+static inline const char *
 quadd_get_event_str(int event)
 {
-       static char *str[] = {
+       static const char * const str[] = {
                [QUADD_EVENT_TYPE_CPU_CYCLES]           = "cpu-cycles",
 
                [QUADD_EVENT_TYPE_INSTRUCTIONS]         = "instructions",
index 9196e5b..6fb762b 100644 (file)
@@ -49,6 +49,7 @@ static long
 quadd_arm_imm(u32 val)
 {
        unsigned int rot = (val & 0xf00) >> 7, imm = (val & 0xff);
+
        return ((imm << (32 - rot)) | (imm >> rot)) & 0xffffffff;
 }
 
index 11dadb9..954ab27 100644 (file)
@@ -256,15 +256,14 @@ add_ex_region(struct regions_data *rd,
                        i_min = mid + 1;
        }
 
-       if (array[i_max].vm_start == new_entry->vm_start) {
+       if (array[i_max].vm_start == new_entry->vm_start)
                return 0;
-       } else {
-               memmove(array + i_max + 1,
-                       array + i_max,
-                       (size - i_max) * sizeof(*array));
-               memcpy(&array[i_max], new_entry, sizeof(*new_entry));
-               return 1;
-       }
+
+       memmove(array + i_max + 1,
+               array + i_max,
+               (size - i_max) * sizeof(*array));
+       memcpy(&array[i_max], new_entry, sizeof(*new_entry));
+       return 1;
 }
 
 static int
@@ -408,7 +407,7 @@ static struct regions_data *rd_alloc(unsigned long size)
        if (!rd)
                return NULL;
 
-       rd->entries = kzalloc(size * sizeof(*rd->entries), GFP_ATOMIC);
+       rd->entries = kcalloc(size, sizeof(*rd->entries), GFP_ATOMIC);
        if (!rd->entries) {
                kfree(rd);
                return NULL;
@@ -431,6 +430,7 @@ static void rd_free(struct regions_data *rd)
 static void rd_free_rcu(struct rcu_head *rh)
 {
        struct regions_data *rd = container_of(rh, struct regions_data, rcu);
+
        rd_free(rd);
 }
 
index 46aa502..3c78dcf 100644 (file)
@@ -152,19 +152,19 @@ quadd_put_sample(struct quadd_record_data *data,
        __put_sample(data, vec, vec_count, 0);
 }
 
-static void put_header(void)
+static void put_header(int cpuid)
 {
-       int cpu_id;
        int nr_events = 0, max_events = QUADD_MAX_COUNTERS;
        int events[QUADD_MAX_COUNTERS];
        struct quadd_record_data record;
        struct quadd_header_data *hdr = &record.hdr;
        struct quadd_parameters *param = &hrt.quadd_ctx->param;
        unsigned int extra = param->reserved[QUADD_PARAM_IDX_EXTRA];
-       struct quadd_iovec vec;
+       struct quadd_iovec vec[2];
        struct quadd_ctx *ctx = hrt.quadd_ctx;
        struct quadd_event_source_interface *pmu = ctx->pmu;
        struct quadd_event_source_interface *pl310 = ctx->pl310;
+       u32 cpuid_data = cpuid;
 
        record.record_type = QUADD_RECORD_TYPE_HEADER;
 
@@ -207,20 +207,26 @@ static void put_header(void)
        if (hrt.get_stack_offset)
                hdr->reserved |= QUADD_HDR_STACK_OFFSET;
 
+       hdr->reserved |= QUADD_HDR_HAS_CPUID;
+
        if (pmu)
-               nr_events += pmu->get_current_events(events, max_events);
+               nr_events += pmu->get_current_events(cpuid, events + nr_events,
+                                                    max_events - nr_events);
 
        if (pl310)
-               nr_events += pl310->get_current_events(events + nr_events,
+               nr_events += pl310->get_current_events(cpuid,
+                                                      events + nr_events,
                                                       max_events - nr_events);
 
        hdr->nr_events = nr_events;
 
-       vec.base = events;
-       vec.len = nr_events * sizeof(events[0]);
+       vec[0].base = events;
+       vec[0].len = nr_events * sizeof(events[0]);
 
-       for_each_possible_cpu(cpu_id)
-               __put_sample(&record, &vec, 1, cpu_id);
+       vec[1].base = &cpuid_data;
+       vec[1].len = sizeof(cpuid_data);
+
+       __put_sample(&record, &vec[0], 2, cpuid);
 }
 
 static void
@@ -305,6 +311,7 @@ static int read_source(struct quadd_event_source_interface *source,
 
                if (s->event_source == QUADD_EVENT_SOURCE_PL310) {
                        int nr_active = atomic_read(&hrt.nr_active_all_core);
+
                        if (nr_active > 1)
                                res_val /= nr_active;
                }
@@ -362,7 +369,7 @@ read_all_sources(struct pt_regs *regs, struct task_struct *task)
        if (task->flags & PF_EXITING)
                return;
 
-       if (ctx->pmu && ctx->pmu_info.active)
+       if (ctx->pmu && ctx->get_pmu_info()->active)
                nr_events += read_source(ctx->pmu, regs,
                                         events, QUADD_MAX_COUNTERS);
 
@@ -446,8 +453,10 @@ read_all_sources(struct pt_regs *regs, struct task_struct *task)
 
        if (hrt.get_stack_offset) {
                long offset = get_stack_offset(task, user_regs, cc);
+
                if (offset > 0) {
                        u32 off = offset >> 2;
+
                        off = min_t(u32, off, 0xffff);
                        extra_data |= off << QUADD_SED_STACK_OFFSET_SHIFT;
                }
@@ -458,6 +467,7 @@ read_all_sources(struct pt_regs *regs, struct task_struct *task)
        s->events_flags = 0;
        for (i = 0; i < nr_events; i++) {
                u32 value = events[i].value;
+
                if (value > 0) {
                        s->events_flags |= 1 << i;
                        events_extra[nr_positive_events++] = value;
@@ -647,6 +657,7 @@ static void reset_cpu_ctx(void)
 int quadd_hrt_start(void)
 {
        int err;
+       int cpuid;
        u64 period;
        long freq;
        unsigned int extra;
@@ -692,7 +703,8 @@ int quadd_hrt_start(void)
        hrt.get_stack_offset =
                (extra & QUADD_PARAM_EXTRA_STACK_OFFSET) ? 1 : 0;
 
-       put_header();
+       for_each_possible_cpu(cpuid)
+               put_header(cpuid);
 
        if (extra & QUADD_PARAM_EXTRA_GET_MMAP) {
                err = quadd_get_current_mmap(param->pids[0]);
index c696923..ddaf30c 100644 (file)
 #endif
 
 static struct quadd_ctx ctx;
+static DEFINE_PER_CPU(struct source_info, ctx_pmu_info);
+static DEFINE_PER_CPU(struct quadd_comm_cap_for_cpu, per_cpu_caps);
+
+static struct source_info *get_pmu_info_for_current_cpu(void)
+{
+       return &__get_cpu_var(ctx_pmu_info);
+}
+
+static struct quadd_comm_cap_for_cpu *get_capabilities_for_cpu_int(int cpuid)
+{
+       return &per_cpu(per_cpu_caps, cpuid);
+}
 
 static int get_default_properties(void)
 {
@@ -60,6 +72,8 @@ static int get_default_properties(void)
 
        ctx.param.pids[0] = 0;
        ctx.param.nr_pids = 1;
+       ctx.get_capabilities_for_cpu = get_capabilities_for_cpu_int;
+       ctx.get_pmu_info = get_pmu_info_for_current_cpu;
 
        return 0;
 }
@@ -178,13 +192,49 @@ validate_freq(unsigned int freq)
 }
 
 static int
+set_parameters_for_cpu(struct quadd_pmu_setup_for_cpu *params)
+{
+       int i;
+       int err;
+       int nr_pmu = 0;
+       int cpuid = params->cpuid;
+
+       struct source_info *pmu_info = &per_cpu(ctx_pmu_info, cpuid);
+       int pmu_events_id[QUADD_MAX_COUNTERS];
+
+       for (i = 0; i < params->nr_events; i++) {
+               int event = params->events[i];
+
+               if (pmu_info->nr_supported_events > 0
+                       && is_event_supported(pmu_info, event)) {
+                       pmu_events_id[nr_pmu++] = event;
+                       pr_info("PMU active event for cpu %d: %s\n",
+                                       cpuid,
+                                       quadd_get_event_str(event));
+               } else {
+                       pr_err("Bad event: %s\n",
+                              quadd_get_event_str(event));
+                       return -EINVAL;
+               }
+       }
+
+       err = ctx.pmu->set_events(cpuid, pmu_events_id, nr_pmu);
+       if (err) {
+               pr_err("PMU set parameters: error\n");
+               return err;
+       }
+       per_cpu(ctx_pmu_info, cpuid).active = 1;
+
+       return err;
+}
+
+static int
 set_parameters(struct quadd_parameters *p)
 {
        int i, err, uid = 0;
        uid_t task_uid, current_uid;
-       int pmu_events_id[QUADD_MAX_COUNTERS];
        int pl310_events_id;
-       int nr_pmu = 0, nr_pl310 = 0;
+       int nr_pl310 = 0;
        struct task_struct *task;
        u64 *low_addr_p;
 
@@ -240,15 +290,9 @@ set_parameters(struct quadd_parameters *p)
        for (i = 0; i < p->nr_events; i++) {
                int event = p->events[i];
 
-               if (ctx.pmu && ctx.pmu_info.nr_supported_events > 0
-                       && is_event_supported(&ctx.pmu_info, event)) {
-                       pmu_events_id[nr_pmu++] = p->events[i];
-
-                       pr_info("PMU active event: %s\n",
-                               quadd_get_event_str(event));
-               } else if (ctx.pl310 &&
-                          ctx.pl310_info.nr_supported_events > 0 &&
-                          is_event_supported(&ctx.pl310_info, event)) {
+               if (ctx.pl310 &&
+                   ctx.pl310_info.nr_supported_events > 0 &&
+                   is_event_supported(&ctx.pl310_info, event)) {
                        pl310_events_id = p->events[i];
 
                        pr_info("PL310 active event: %s\n",
@@ -265,23 +309,11 @@ set_parameters(struct quadd_parameters *p)
                }
        }
 
-       if (ctx.pmu) {
-               if (nr_pmu > 0) {
-                       err = ctx.pmu->set_events(pmu_events_id, nr_pmu);
-                       if (err) {
-                               pr_err("PMU set parameters: error\n");
-                               return err;
-                       }
-                       ctx.pmu_info.active = 1;
-               } else {
-                       ctx.pmu_info.active = 0;
-                       ctx.pmu->set_events(NULL, 0);
-               }
-       }
-
        if (ctx.pl310) {
+               int cpuid = 0; /* We don't need cpuid for pl310.  */
+
                if (nr_pl310 == 1) {
-                       err = ctx.pl310->set_events(&pl310_events_id, 1);
+                       err = ctx.pl310->set_events(cpuid, &pl310_events_id, 1);
                        if (err) {
                                pr_info("pl310 set_parameters: error\n");
                                return err;
@@ -289,7 +321,7 @@ set_parameters(struct quadd_parameters *p)
                        ctx.pl310_info.active = 1;
                } else {
                        ctx.pl310_info.active = 0;
-                       ctx.pl310->set_events(NULL, 0);
+                       ctx.pl310->set_events(cpuid, NULL, 0);
                }
        }
 
@@ -306,9 +338,90 @@ set_parameters(struct quadd_parameters *p)
        return 0;
 }
 
-static void get_capabilities(struct quadd_comm_cap *cap)
+static void
+get_capabilities_for_cpu(int cpuid, struct quadd_comm_cap_for_cpu *cap)
 {
-       int i, event;
+       int i;
+       struct quadd_events_cap *events_cap = &cap->events_cap;
+       struct source_info *s = &per_cpu(ctx_pmu_info, cpuid);
+
+       cap->cpuid = cpuid;
+       events_cap->cpu_cycles = 0;
+       events_cap->l1_dcache_read_misses = 0;
+       events_cap->l1_dcache_write_misses = 0;
+       events_cap->l1_icache_misses = 0;
+
+       events_cap->instructions = 0;
+       events_cap->branch_instructions = 0;
+       events_cap->branch_misses = 0;
+       events_cap->bus_cycles = 0;
+
+       events_cap->l2_dcache_read_misses = 0;
+       events_cap->l2_dcache_write_misses = 0;
+       events_cap->l2_icache_misses = 0;
+
+       for (i = 0; i < s->nr_supported_events; i++) {
+               int event = s->supported_events[i];
+
+               if (event == QUADD_EVENT_TYPE_L2_DCACHE_READ_MISSES ||
+                   event == QUADD_EVENT_TYPE_L2_DCACHE_WRITE_MISSES ||
+                   event == QUADD_EVENT_TYPE_L2_ICACHE_MISSES) {
+                       cap->l2_cache = 1;
+                       cap->l2_multiple_events = 1;
+                       break;
+               }
+
+               switch (event) {
+               case QUADD_EVENT_TYPE_CPU_CYCLES:
+                       events_cap->cpu_cycles = 1;
+                       break;
+               case QUADD_EVENT_TYPE_INSTRUCTIONS:
+                       events_cap->instructions = 1;
+                       break;
+               case QUADD_EVENT_TYPE_BRANCH_INSTRUCTIONS:
+                       events_cap->branch_instructions = 1;
+                       break;
+               case QUADD_EVENT_TYPE_BRANCH_MISSES:
+                       events_cap->branch_misses = 1;
+                       break;
+               case QUADD_EVENT_TYPE_BUS_CYCLES:
+                       events_cap->bus_cycles = 1;
+                       break;
+
+               case QUADD_EVENT_TYPE_L1_DCACHE_READ_MISSES:
+                       events_cap->l1_dcache_read_misses = 1;
+                       break;
+               case QUADD_EVENT_TYPE_L1_DCACHE_WRITE_MISSES:
+                       events_cap->l1_dcache_write_misses = 1;
+                       break;
+               case QUADD_EVENT_TYPE_L1_ICACHE_MISSES:
+                       events_cap->l1_icache_misses = 1;
+                       break;
+
+               case QUADD_EVENT_TYPE_L2_DCACHE_READ_MISSES:
+                       events_cap->l2_dcache_read_misses = 1;
+                       break;
+               case QUADD_EVENT_TYPE_L2_DCACHE_WRITE_MISSES:
+                       events_cap->l2_dcache_write_misses = 1;
+                       break;
+               case QUADD_EVENT_TYPE_L2_ICACHE_MISSES:
+                       events_cap->l2_icache_misses = 1;
+                       break;
+
+               default:
+                       pr_err_once("%s: error: invalid event\n",
+                                               __func__);
+                       return;
+               }
+       }
+}
+
+static void
+get_capabilities(struct quadd_comm_cap *cap)
+{
+       int i;
+       int cpuid;
        unsigned int extra = 0;
        struct quadd_events_cap *events_cap = &cap->events_cap;
 
@@ -318,18 +431,6 @@ static void get_capabilities(struct quadd_comm_cap *cap)
        if (ctx.pl310) {
                cap->l2_cache = 1;
                cap->l2_multiple_events = 0;
-       } else if (ctx.pmu) {
-               struct source_info *s = &ctx.pmu_info;
-               for (i = 0; i < s->nr_supported_events; i++) {
-                       event = s->supported_events[i];
-                       if (event == QUADD_EVENT_TYPE_L2_DCACHE_READ_MISSES ||
-                           event == QUADD_EVENT_TYPE_L2_DCACHE_WRITE_MISSES ||
-                           event == QUADD_EVENT_TYPE_L2_ICACHE_MISSES) {
-                               cap->l2_cache = 1;
-                               cap->l2_multiple_events = 1;
-                               break;
-                       }
-               }
        }
 
        events_cap->cpu_cycles = 0;
@@ -348,60 +449,11 @@ static void get_capabilities(struct quadd_comm_cap *cap)
 
        if (ctx.pl310) {
                struct source_info *s = &ctx.pl310_info;
-               for (i = 0; i < s->nr_supported_events; i++) {
-                       int event = s->supported_events[i];
-
-                       switch (event) {
-                       case QUADD_EVENT_TYPE_L2_DCACHE_READ_MISSES:
-                               events_cap->l2_dcache_read_misses = 1;
-                               break;
-                       case QUADD_EVENT_TYPE_L2_DCACHE_WRITE_MISSES:
-                               events_cap->l2_dcache_write_misses = 1;
-                               break;
-                       case QUADD_EVENT_TYPE_L2_ICACHE_MISSES:
-                               events_cap->l2_icache_misses = 1;
-                               break;
 
-                       default:
-                               pr_err_once("%s: error: invalid event\n",
-                                           __func__);
-                               return;
-                       }
-               }
-       }
-
-       if (ctx.pmu) {
-               struct source_info *s = &ctx.pmu_info;
                for (i = 0; i < s->nr_supported_events; i++) {
                        int event = s->supported_events[i];
 
                        switch (event) {
-                       case QUADD_EVENT_TYPE_CPU_CYCLES:
-                               events_cap->cpu_cycles = 1;
-                               break;
-                       case QUADD_EVENT_TYPE_INSTRUCTIONS:
-                               events_cap->instructions = 1;
-                               break;
-                       case QUADD_EVENT_TYPE_BRANCH_INSTRUCTIONS:
-                               events_cap->branch_instructions = 1;
-                               break;
-                       case QUADD_EVENT_TYPE_BRANCH_MISSES:
-                               events_cap->branch_misses = 1;
-                               break;
-                       case QUADD_EVENT_TYPE_BUS_CYCLES:
-                               events_cap->bus_cycles = 1;
-                               break;
-
-                       case QUADD_EVENT_TYPE_L1_DCACHE_READ_MISSES:
-                               events_cap->l1_dcache_read_misses = 1;
-                               break;
-                       case QUADD_EVENT_TYPE_L1_DCACHE_WRITE_MISSES:
-                               events_cap->l1_dcache_write_misses = 1;
-                               break;
-                       case QUADD_EVENT_TYPE_L1_ICACHE_MISSES:
-                               events_cap->l1_icache_misses = 1;
-                               break;
-
                        case QUADD_EVENT_TYPE_L2_DCACHE_READ_MISSES:
                                events_cap->l2_dcache_read_misses = 1;
                                break;
@@ -433,11 +485,15 @@ static void get_capabilities(struct quadd_comm_cap *cap)
        extra |= QUADD_COMM_CAP_EXTRA_UNWIND_MIXED;
        extra |= QUADD_COMM_CAP_EXTRA_UNW_ENTRY_TYPE;
        extra |= QUADD_COMM_CAP_EXTRA_RB_MMAP_OP;
+       extra |= QUADD_COMM_CAP_EXTRA_CPU_MASK;
 
        if (ctx.hrt->tc)
                extra |= QUADD_COMM_CAP_EXTRA_ARCH_TIMER;
 
        cap->reserved[QUADD_COMM_CAP_IDX_EXTRA] = extra;
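+
+       /* Possible-CPU bitmask; assumes CPU ids fit in a u32. */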
+       cap->reserved[QUADD_COMM_CAP_IDX_CPU_MASK] = 0;
+       for_each_possible_cpu(cpuid)
+               cap->reserved[QUADD_COMM_CAP_IDX_CPU_MASK] |= (1 << cpuid);
 }
 
 void quadd_get_state(struct quadd_module_state *state)
@@ -472,7 +528,9 @@ static struct quadd_comm_control_interface control = {
        .start                  = start,
        .stop                   = stop,
        .set_parameters         = set_parameters,
+       .set_parameters_for_cpu = set_parameters_for_cpu,
        .get_capabilities       = get_capabilities,
+       .get_capabilities_for_cpu = get_capabilities_for_cpu,
        .get_state              = quadd_get_state,
        .set_extab              = set_extab,
        .delete_mmap            = delete_mmap,
@@ -482,6 +540,7 @@ static int __init quadd_module_init(void)
 {
        int i, nr_events, err;
        int *events;
+       int cpuid;
 
        pr_info("Branch: %s\n", QUADD_MODULE_BRANCH);
        pr_info("Version: %s\n", QUADD_MODULE_VERSION);
@@ -497,7 +556,11 @@ static int __init quadd_module_init(void)
 
        get_default_properties();
 
-       ctx.pmu_info.active = 0;
+       for_each_possible_cpu(cpuid) {
+               struct source_info *pmu_info = &per_cpu(ctx_pmu_info, cpuid);
+
+               pmu_info->active = 0;
+       }
        ctx.pl310_info.active = 0;
 
 #ifdef CONFIG_ARM64
@@ -508,16 +571,24 @@ static int __init quadd_module_init(void)
        if (!ctx.pmu) {
                pr_err("PMU init failed\n");
                return -ENODEV;
-       } else {
-               events = ctx.pmu_info.supported_events;
-               nr_events = ctx.pmu->get_supported_events(events,
-                                                         QUADD_MAX_COUNTERS);
-               ctx.pmu_info.nr_supported_events = nr_events;
+       }
+
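+       /* PMUs may differ across CPUs, so query supported events per CPU. */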
+       for_each_possible_cpu(cpuid) {
+               struct source_info *pmu_info = &per_cpu(ctx_pmu_info,
+                                                       cpuid);
+
+               events = pmu_info->supported_events;
+               nr_events =
+                   ctx.pmu->get_supported_events(cpuid, events,
+                                                 QUADD_MAX_COUNTERS);
 
-               pr_debug("PMU: amount of events: %d\n", nr_events);
+               pmu_info->nr_supported_events = nr_events;
+
+               pr_debug("CPU: %d PMU: amount of events: %d\n",
+                        cpuid, nr_events);
 
                for (i = 0; i < nr_events; i++)
-                       pr_debug("PMU event: %s\n",
+                       pr_debug("CPU: %d PMU event: %s\n", cpuid,
                                 quadd_get_event_str(events[i]));
        }
 
@@ -528,7 +599,7 @@ static int __init quadd_module_init(void)
 #endif
        if (ctx.pl310) {
                events = ctx.pl310_info.supported_events;
-               nr_events = ctx.pl310->get_supported_events(events,
+               nr_events = ctx.pl310->get_supported_events(0, events,
                                                            QUADD_MAX_COUNTERS);
                ctx.pl310_info.nr_supported_events = nr_events;
 
@@ -573,6 +644,10 @@ static int __init quadd_module_init(void)
        }
 
        get_capabilities(&ctx.cap);
+
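+       /* Cache the per-CPU capabilities next to the global ones. */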
+       for_each_possible_cpu(cpuid)
+               get_capabilities_for_cpu(cpuid, &per_cpu(per_cpu_caps, cpuid));
+
        quadd_proc_init(&ctx);
 
        return 0;
index 98d6b25..0172bb1 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * drivers/misc/tegra-profiler/pl310.c
  *
- * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2014-2015, NVIDIA CORPORATION.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -134,6 +134,7 @@ static u32 l2x0_read_perf_event(void)
 static void l2x0_clear_values(void)
 {
        int cpu_id;
+
        for (cpu_id = 0; cpu_id < nr_cpu_ids; cpu_id++)
                per_cpu(pl310_prev_val, cpu_id) = 0;
 }
@@ -206,7 +207,7 @@ l2x0_events_read(struct event_data *events, int max_events)
 }
 
 static int __maybe_unused
-l2x0_events_read_emulate(struct event_data *events, int max_events)
+l2x0_events_read_emulate(int cpuid, struct event_data *events, int max_events)
 {
        static u32 val;
 
@@ -229,7 +230,7 @@ l2x0_events_read_emulate(struct event_data *events, int max_events)
        return 1;
 }
 
-static int __maybe_unused l2x0_set_events(int *events, int size)
+static int __maybe_unused l2x0_set_events(int cpuid, int *events, int size)
 {
        if (!events || size == 0) {
                l2x0_ctx.l2x0_event_type = -1;
@@ -263,7 +264,7 @@ static int __maybe_unused l2x0_set_events(int *events, int size)
        return 0;
 }
 
-static int get_supported_events(int *events, int max_events)
+static int get_supported_events(int cpuid, int *events, int max_events)
 {
        if (max_events < 3)
                return 0;
@@ -275,7 +276,7 @@ static int get_supported_events(int *events, int max_events)
        return 3;
 }
 
-static int get_current_events(int *events, int max_events)
+static int get_current_events(int cpuid, int *events, int max_events)
 {
        if (max_events == 0)
                return 0;
index 0f5febf..b0d728b 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * drivers/misc/tegra-profiler/quadd.h
  *
- * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2014-2015, NVIDIA CORPORATION.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -33,10 +33,10 @@ struct quadd_event_source_interface {
        void (*start)(void);
        void (*stop)(void);
        int (*read)(struct event_data *events, int max_events);
-       int (*set_events)(int *events, int size);
-       int (*get_supported_events)(int *events, int max_events);
-       int (*get_current_events)(int *events, int max_events);
-       struct quadd_arch_info * (*get_arch)(void);
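+       /* cpuid selects which CPU's PMU instance the call operates on. */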
+       int (*set_events)(int cpuid, int *events, int size);
+       int (*get_supported_events)(int cpuid, int *events, int max_events);
+       int (*get_current_events)(int cpuid, int *events, int max_events);
+       struct quadd_arch_info * (*get_arch)(int cpuid);
 };
 
 struct source_info {
@@ -51,7 +51,8 @@ struct quadd_ctx {
        struct quadd_comm_cap cap;
 
        struct quadd_event_source_interface *pmu;
-       struct source_info pmu_info;
+       struct source_info * (*get_pmu_info)(void);
+       struct quadd_comm_cap_for_cpu * (*get_capabilities_for_cpu)(int cpuid);
 
        struct quadd_event_source_interface *pl310;
        struct source_info pl310_info;
index dde99e6..9d446a4 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * drivers/misc/tegra-profiler/quadd_proc.c
  *
- * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2014-2015, NVIDIA CORPORATION.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -56,13 +56,11 @@ static const struct file_operations version_proc_fops = {
 
 static int show_capabilities(struct seq_file *f, void *offset)
 {
+       int cpuid;
        struct quadd_comm_cap *cap = &ctx->cap;
-       struct quadd_events_cap *event = &cap->events_cap;
        unsigned int extra = cap->reserved[QUADD_COMM_CAP_IDX_EXTRA];
        struct quadd_arch_info *arch = NULL;
 
-       if (ctx->pmu)
-               arch = ctx->pmu->get_arch();
-
        seq_printf(f, "pmu:                                   %s\n",
                   YES_NO(cap->pmu));
@@ -71,13 +69,6 @@ static int show_capabilities(struct seq_file *f, void *offset)
        seq_printf(f, "power rate samples:                    %s\n",
                   YES_NO(cap->power_rate));
 
-       seq_printf(f, "l2 cache:                              %s\n",
-                  YES_NO(cap->l2_cache));
-       if (cap->l2_cache) {
-               seq_printf(f, "multiple l2 events:                    %s\n",
-                          YES_NO(cap->l2_multiple_events));
-       }
-
        seq_printf(f, "support polling mode:                  %s\n",
                   YES_NO(cap->blocked_read));
        seq_printf(f, "backtrace from the kernel ctx:         %s\n",
@@ -103,38 +94,55 @@ static int show_capabilities(struct seq_file *f, void *offset)
 
        seq_puts(f, "\n");
 
-       if (arch) {
-               seq_printf(f, "pmu arch:                              %s\n",
-                       arch->name);
-               seq_printf(f, "pmu arch version:                      %d\n",
-                       arch->ver);
+       if (ctx->pmu) {
+               for_each_possible_cpu(cpuid) {
+                       struct quadd_comm_cap_for_cpu *cpu_cap;
+                       struct quadd_events_cap *event;
+
+                       cpu_cap = ctx->get_capabilities_for_cpu(cpuid);
+                       event = &cpu_cap->events_cap;
+
+                       arch = ctx->pmu->get_arch(cpuid);
+                       seq_printf(f, "\nCPU %d\n", cpuid);
+                       if (arch) {
+                               seq_printf(f, "pmu arch:                  %s\n",
+                                          arch->name);
+                               seq_printf(f, "pmu arch version:          %d\n",
+                                          arch->ver);
+                       }
+                       seq_printf(f, "l2 cache:                  %s\n",
+                                  YES_NO(cpu_cap->l2_cache));
+                       if (cpu_cap->l2_cache) {
+                               seq_printf(f, "multiple l2 events:        %s\n",
+                                          YES_NO(cpu_cap->l2_multiple_events));
+                       }
+
+                       seq_puts(f, "  Supported events:\n");
+                       seq_printf(f, "  cpu_cycles:                     %s\n",
+                                  YES_NO(event->cpu_cycles));
+                       seq_printf(f, "  instructions:                   %s\n",
+                                  YES_NO(event->instructions));
+                       seq_printf(f, "  branch_instructions:            %s\n",
+                                  YES_NO(event->branch_instructions));
+                       seq_printf(f, "  branch_misses:                  %s\n",
+                                  YES_NO(event->branch_misses));
+                       seq_printf(f, "  bus_cycles:                     %s\n",
+                                  YES_NO(event->bus_cycles));
+                       seq_printf(f, "  l1_dcache_read_misses:          %s\n",
+                                  YES_NO(event->l1_dcache_read_misses));
+                       seq_printf(f, "  l1_dcache_write_misses:         %s\n",
+                                  YES_NO(event->l1_dcache_write_misses));
+                       seq_printf(f, "  l1_icache_misses:               %s\n",
+                                  YES_NO(event->l1_icache_misses));
+                       seq_printf(f, "  l2_dcache_read_misses:          %s\n",
+                                  YES_NO(event->l2_dcache_read_misses));
+                       seq_printf(f, "  l2_dcache_write_misses:         %s\n",
+                                  YES_NO(event->l2_dcache_write_misses));
+                       seq_printf(f, "  l2_icache_misses:               %s\n",
+                                  YES_NO(event->l2_icache_misses));
+               }
        }
 
-       seq_puts(f, "\n");
-       seq_puts(f, "Supported events:\n");
-       seq_printf(f, "cpu_cycles:                     %s\n",
-                  YES_NO(event->cpu_cycles));
-       seq_printf(f, "instructions:                   %s\n",
-                  YES_NO(event->instructions));
-       seq_printf(f, "branch_instructions:            %s\n",
-                  YES_NO(event->branch_instructions));
-       seq_printf(f, "branch_misses:                  %s\n",
-                  YES_NO(event->branch_misses));
-       seq_printf(f, "bus_cycles:                     %s\n",
-                  YES_NO(event->bus_cycles));
-       seq_printf(f, "l1_dcache_read_misses:          %s\n",
-                  YES_NO(event->l1_dcache_read_misses));
-       seq_printf(f, "l1_dcache_write_misses:         %s\n",
-                  YES_NO(event->l1_dcache_write_misses));
-       seq_printf(f, "l1_icache_misses:               %s\n",
-                  YES_NO(event->l1_icache_misses));
-       seq_printf(f, "l2_dcache_read_misses:          %s\n",
-                  YES_NO(event->l2_dcache_read_misses));
-       seq_printf(f, "l2_dcache_write_misses:         %s\n",
-                  YES_NO(event->l2_dcache_write_misses));
-       seq_printf(f, "l2_icache_misses:               %s\n",
-                  YES_NO(event->l2_icache_misses));
-
        return 0;
 }
 
index 1a253cf..2530912 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * drivers/misc/tegra-profiler/tegra.h
  *
- * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2014-2015, NVIDIA CORPORATION.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -18,7 +18,7 @@
 #define __QUADD_TEGRA_H
 
 #include <linux/smp.h>
-#include <asm/ptrace.h>
+#include <linux/ptrace.h>
 
 #ifdef CONFIG_TEGRA_CLUSTER_CONTROL
 #include <linux/io.h>
index 5005121..a189422 100644 (file)
@@ -18,7 +18,7 @@
 #ifndef __QUADD_VERSION_H
 #define __QUADD_VERSION_H
 
-#define QUADD_MODULE_VERSION           "1.104"
+#define QUADD_MODULE_VERSION           "1.105"
 #define QUADD_MODULE_BRANCH            "Dev"
 
 #endif /* __QUADD_VERSION_H */
index caf58e6..04a8d2e 100644 (file)
@@ -19,8 +19,8 @@
 
 #include <linux/ioctl.h>
 
-#define QUADD_SAMPLES_VERSION  34
-#define QUADD_IO_VERSION       18
+#define QUADD_SAMPLES_VERSION  35
+#define QUADD_IO_VERSION       19
 
 #define QUADD_IO_VERSION_DYNAMIC_RB            5
 #define QUADD_IO_VERSION_RB_MAX_FILL_COUNT     6
@@ -36,6 +36,7 @@
 #define QUADD_IO_VERSION_STACK_OFFSET          16
 #define QUADD_IO_VERSION_SECTIONS_INFO         17
 #define QUADD_IO_VERSION_UNW_METHODS_OPT       18
+#define QUADD_IO_VERSION_PER_CPU_SETUP          19
 
 #define QUADD_SAMPLE_VERSION_THUMB_MODE_FLAG   17
 #define QUADD_SAMPLE_VERSION_GROUP_SAMPLES     18
@@ -53,6 +54,7 @@
 #define QUADD_SAMPLE_VERSION_SCHED_TASK_STATE  32
 #define QUADD_SAMPLE_VERSION_URCS              33
 #define QUADD_SAMPLE_VERSION_HOTPLUG           34
+#define QUADD_SAMPLE_VERSION_PER_CPU_SETUP      35
 
 #define QUADD_MMAP_HEADER_VERSION              1
 
  */
 #define IOCTL_SET_SECTIONS_INFO _IOW(QUADD_IOCTL, 8, struct quadd_sections)
 
+/*
+ * Per CPU PMU setup
+ */
+#define IOCTL_SETUP_PMU_FOR_CPU _IOW(QUADD_IOCTL, 9,\
+                                    struct quadd_pmu_setup_for_cpu)
+
+/*
+ * Per CPU capabilities
+ */
+#define IOCTL_GET_CAP_FOR_CPU _IOWR(QUADD_IOCTL, 10,\
+                                   struct quadd_comm_cap_for_cpu)
+
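+/*
+ * Illustrative sketch of the intended user-space flow for the two ioctls
+ * above; error handling, includes and the device path are assumptions and
+ * not defined by this header:
+ *
+ *     int fd = open("/dev/quadd", O_RDONLY);
+ *     struct quadd_comm_cap_for_cpu cap = { .cpuid = 0 };
+ *     struct quadd_pmu_setup_for_cpu setup = { .cpuid = 0 };
+ *
+ *     ioctl(fd, IOCTL_GET_CAP_FOR_CPU, &cap);
+ *     if (cap.events_cap.cpu_cycles)
+ *             setup.events[setup.nr_events++] = QUADD_EVENT_TYPE_CPU_CYCLES;
+ *     ioctl(fd, IOCTL_SETUP_PMU_FOR_CPU, &setup);
+ */
+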
 #define QUADD_CPUMODE_TEGRA_POWER_CLUSTER_LP   (1 << 29)       /* LP CPU */
 #define QUADD_CPUMODE_THUMB                    (1 << 30)       /* thumb mode */
 
@@ -331,6 +347,7 @@ struct quadd_debug_data {
 #define QUADD_HDR_USE_ARCH_TIMER       (1 << 3)
 #define QUADD_HDR_STACK_OFFSET         (1 << 4)
 #define QUADD_HDR_BT_DWARF             (1 << 5)
+#define QUADD_HDR_HAS_CPUID            (1 << 6)
 
 struct quadd_header_data {
        u16 magic;
@@ -387,6 +404,7 @@ enum {
 #define QUADD_PARAM_EXTRA_STACK_OFFSET         (1 << 5)
 #define QUADD_PARAM_EXTRA_BT_UT_CE             (1 << 6)
 #define QUADD_PARAM_EXTRA_BT_DWARF             (1 << 7)
+#define QUADD_PARAM_EXTRA_PER_PMU_SETUP                (1 << 8)
 
 struct quadd_parameters {
        u32 freq;
@@ -409,6 +427,14 @@ struct quadd_parameters {
        u32 reserved[16];       /* reserved fields for future extensions */
 };
 
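+/* IOCTL_SETUP_PMU_FOR_CPU argument: events to count on CPU "cpuid". */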
+struct quadd_pmu_setup_for_cpu {
+       u32 cpuid;
+       u32 events[QUADD_MAX_COUNTERS];
+       u32 nr_events;
+
+       u32 reserved[16];
+};
+
 struct quadd_events_cap {
        u32     cpu_cycles:1,
                instructions:1,
@@ -427,6 +453,7 @@ struct quadd_events_cap {
 
 enum {
        QUADD_COMM_CAP_IDX_EXTRA = 0,
+       QUADD_COMM_CAP_IDX_CPU_MASK = 1,
 };
 
 #define QUADD_COMM_CAP_EXTRA_BT_KERNEL_CTX     (1 << 0)
@@ -439,6 +466,7 @@ enum {
 #define QUADD_COMM_CAP_EXTRA_UNW_ENTRY_TYPE    (1 << 7)
 #define QUADD_COMM_CAP_EXTRA_ARCH_TIMER                (1 << 8)
 #define QUADD_COMM_CAP_EXTRA_RB_MMAP_OP                (1 << 9)
+#define QUADD_COMM_CAP_EXTRA_CPU_MASK          (1 << 10)
 
 struct quadd_comm_cap {
        u32     pmu:1,
@@ -448,11 +476,18 @@ struct quadd_comm_cap {
                tegra_lp_cluster:1,
                blocked_read:1;
 
-       struct quadd_events_cap events_cap;
+       struct quadd_events_cap events_cap; /* Deprecated. */
 
        u32 reserved[16];       /* reserved fields for future extensions */
 };
 
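+/*
+ * Per-CPU capabilities, returned by IOCTL_GET_CAP_FOR_CPU; supersedes the
+ * deprecated global events_cap in struct quadd_comm_cap.
+ */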
+struct quadd_comm_cap_for_cpu {
+       u32     l2_cache:1,
+               l2_multiple_events:1;
+       int cpuid;
+       struct quadd_events_cap events_cap;
+};
+
 enum {
        QUADD_MOD_STATE_IDX_RB_MAX_FILL_COUNT = 0,
        QUADD_MOD_STATE_IDX_STATUS,