Merge branch 'linus' into perf/core

author Ingo Molnar <mingo@elte.hu>
Thu, 23 Sep 2010 06:02:09 +0000 (08:02 +0200)
committer Ingo Molnar <mingo@elte.hu>
Thu, 23 Sep 2010 06:02:09 +0000 (08:02 +0200)
diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index 85d8e4f58c83ce612269162b9635bd49059c39dd..1cc49683fb69b2a5f96639e71a2f1af821479e77 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -307,7 +307,7 @@ again:
                              new_raw_count) != prev_raw_count)
                 goto again;
  
-       delta = (new_raw_count  - (prev_raw_count & alpha_pmu->pmc_count_mask[idx])) + ovf;
+       delta = (new_raw_count - (prev_raw_count & alpha_pmu->pmc_count_mask[idx])) + ovf;
  
         /* It is possible on very rare occasions that the PMC has overflowed
          * but the interrupt is yet to come.  Detect and fix this situation.
@@ -402,14 +402,13 @@ static void maybe_change_configuration(struct cpu_hw_events *cpuc)
                 struct hw_perf_event *hwc = &pe->hw;
                 int idx = hwc->idx;
  
-               if (cpuc->current_idx[j] != PMC_NO_INDEX) {
-                       cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
-                       continue;
+               if (cpuc->current_idx[j] == PMC_NO_INDEX) {
+                       alpha_perf_event_set_period(pe, hwc, idx);
+                       cpuc->current_idx[j] = idx;
                 }
  
-               alpha_perf_event_set_period(pe, hwc, idx);
-               cpuc->current_idx[j] = idx;
-               cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
+               if (!(hwc->state & PERF_HES_STOPPED))
+                       cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
         }
         cpuc->config = cpuc->event[0]->hw.config_base;
  }
@@ -420,12 +419,13 @@ static void maybe_change_configuration(struct cpu_hw_events *cpuc)
   *  - this function is called from outside this module via the pmu struct
   *    returned from perf event initialisation.
   */
-static int alpha_pmu_enable(struct perf_event *event)
+static int alpha_pmu_add(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
         int n0;
         int ret;
-       unsigned long flags;
+       unsigned long irq_flags;
  
         /*
          * The Sparc code has the IRQ disable first followed by the perf
@@ -435,8 +435,8 @@ static int alpha_pmu_enable(struct perf_event *event)
          * nevertheless we disable the PMCs first to enable a potential
          * final PMI to occur before we disable interrupts.
          */
-       perf_disable();
-       local_irq_save(flags);
+       perf_pmu_disable(event->pmu);
+       local_irq_save(irq_flags);
  
         /* Default to error to be returned */
         ret = -EAGAIN;
@@ -455,8 +455,12 @@ static int alpha_pmu_enable(struct perf_event *event)
                 }
         }
  
-       local_irq_restore(flags);
-       perf_enable();
+       hwc->state = PERF_HES_UPTODATE;
+       if (!(flags & PERF_EF_START))
+               hwc->state |= PERF_HES_STOPPED;
+
+       local_irq_restore(irq_flags);
+       perf_pmu_enable(event->pmu);
  
         return ret;
  }
@@ -467,15 +471,15 @@ static int alpha_pmu_enable(struct perf_event *event)
   *  - this function is called from outside this module via the pmu struct
   *    returned from perf event initialisation.
   */
-static void alpha_pmu_disable(struct perf_event *event)
+static void alpha_pmu_del(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
-       unsigned long flags;
+       unsigned long irq_flags;
         int j;
  
-       perf_disable();
-       local_irq_save(flags);
+       perf_pmu_disable(event->pmu);
+       local_irq_save(irq_flags);
  
         for (j = 0; j < cpuc->n_events; j++) {
                 if (event == cpuc->event[j]) {
@@ -501,8 +505,8 @@ static void alpha_pmu_disable(struct perf_event *event)
                 }
         }
  
-       local_irq_restore(flags);
-       perf_enable();
+       local_irq_restore(irq_flags);
+       perf_pmu_enable(event->pmu);
  }
  
  
@@ -514,13 +518,44 @@ static void alpha_pmu_read(struct perf_event *event)
  }
  
  
-static void alpha_pmu_unthrottle(struct perf_event *event)
+static void alpha_pmu_stop(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+       if (!(hwc->state & PERF_HES_STOPPED)) {
+               cpuc->idx_mask &= ~(1UL<<hwc->idx);
+               hwc->state |= PERF_HES_STOPPED;
+       }
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               alpha_perf_event_update(event, hwc, hwc->idx, 0);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+
+       if (cpuc->enabled)
+               wrperfmon(PERFMON_CMD_DISABLE, (1UL<<hwc->idx));
+}
+
+
+static void alpha_pmu_start(struct perf_event *event, int flags)
  {
         struct hw_perf_event *hwc = &event->hw;
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
+       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+               return;
+
+       if (flags & PERF_EF_RELOAD) {
+               WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+               alpha_perf_event_set_period(event, hwc, hwc->idx);
+       }
+
+       hwc->state = 0;
+
         cpuc->idx_mask |= 1UL<<hwc->idx;
-       wrperfmon(PERFMON_CMD_ENABLE, (1UL<<hwc->idx));
+       if (cpuc->enabled)
+               wrperfmon(PERFMON_CMD_ENABLE, (1UL<<hwc->idx));
  }
  
  
@@ -642,39 +677,36 @@ static int __hw_perf_event_init(struct perf_event *event)
         return 0;
  }
  
-static const struct pmu pmu = {
-       .enable         = alpha_pmu_enable,
-       .disable        = alpha_pmu_disable,
-       .read           = alpha_pmu_read,
-       .unthrottle     = alpha_pmu_unthrottle,
-};
-
-
  /*
   * Main entry point to initialise a HW performance event.
   */
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int alpha_pmu_event_init(struct perf_event *event)
  {
         int err;
  
+       switch (event->attr.type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HARDWARE:
+       case PERF_TYPE_HW_CACHE:
+               break;
+
+       default:
+               return -ENOENT;
+       }
+
         if (!alpha_pmu)
-               return ERR_PTR(-ENODEV);
+               return -ENODEV;
  
         /* Do the real initialisation work. */
         err = __hw_perf_event_init(event);
  
-       if (err)
-               return ERR_PTR(err);
-
-       return &pmu;
+       return err;
  }
  
-
-
  /*
   * Main entry point - enable HW performance counters.
   */
-void hw_perf_enable(void)
+static void alpha_pmu_enable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
@@ -700,7 +732,7 @@ void hw_perf_enable(void)
   * Main entry point - disable HW performance counters.
   */
  
-void hw_perf_disable(void)
+static void alpha_pmu_disable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
@@ -713,6 +745,17 @@ void hw_perf_disable(void)
         wrperfmon(PERFMON_CMD_DISABLE, cpuc->idx_mask);
  }
  
+static struct pmu pmu = {
+       .pmu_enable     = alpha_pmu_enable,
+       .pmu_disable    = alpha_pmu_disable,
+       .event_init     = alpha_pmu_event_init,
+       .add            = alpha_pmu_add,
+       .del            = alpha_pmu_del,
+       .start          = alpha_pmu_start,
+       .stop           = alpha_pmu_stop,
+       .read           = alpha_pmu_read,
+};
+
  
  /*
   * Main entry point - don't know when this is called but it
@@ -766,7 +809,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
         wrperfmon(PERFMON_CMD_DISABLE, cpuc->idx_mask);
  
         /* la_ptr is the counter that overflowed. */
-       if (unlikely(la_ptr >= perf_max_events)) {
+       if (unlikely(la_ptr >= alpha_pmu->num_pmcs)) {
                 /* This should never occur! */
                 irq_err_count++;
                 pr_warning("PMI: silly index %ld\n", la_ptr);
@@ -807,7 +850,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
                         /* Interrupts coming too quickly; "throttle" the
                          * counter, i.e., disable it for a little while.
                          */
-                       cpuc->idx_mask &= ~(1UL<<idx);
+                       alpha_pmu_stop(event, 0);
                 }
         }
         wrperfmon(PERFMON_CMD_ENABLE, cpuc->idx_mask);
@@ -837,6 +880,7 @@ void __init init_hw_perf_events(void)
  
         /* And set up PMU specification */
         alpha_pmu = &ev67_pmu;
-       perf_max_events = alpha_pmu->num_pmcs;
+
+       perf_pmu_register(&pmu);
  }
  
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index ecbb0288e5dd95c80b420635dffee6ef600f86a8..ad19c276b10fa1b5d7f82a8f5c2169560d9ebdd0 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -221,46 +221,56 @@ again:
  }
  
  static void
-armpmu_disable(struct perf_event *event)
+armpmu_read(struct perf_event *event)
  {
-       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
-       int idx = hwc->idx;
-
-       WARN_ON(idx < 0);
-
-       clear_bit(idx, cpuc->active_mask);
-       armpmu->disable(hwc, idx);
-
-       barrier();
  
-       armpmu_event_update(event, hwc, idx);
-       cpuc->events[idx] = NULL;
-       clear_bit(idx, cpuc->used_mask);
+       /* Don't read disabled counters! */
+       if (hwc->idx < 0)
+               return;
  
-       perf_event_update_userpage(event);
+       armpmu_event_update(event, hwc, hwc->idx);
  }
  
  static void
-armpmu_read(struct perf_event *event)
+armpmu_stop(struct perf_event *event, int flags)
  {
         struct hw_perf_event *hwc = &event->hw;
  
-       /* Don't read disabled counters! */
-       if (hwc->idx < 0)
+       if (!armpmu)
                 return;
  
-       armpmu_event_update(event, hwc, hwc->idx);
+       /*
+        * ARM pmu always has to update the counter, so ignore
+        * PERF_EF_UPDATE, see comments in armpmu_start().
+        */
+       if (!(hwc->state & PERF_HES_STOPPED)) {
+               armpmu->disable(hwc, hwc->idx);
+               barrier(); /* why? */
+               armpmu_event_update(event, hwc, hwc->idx);
+               hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+       }
  }
  
  static void
-armpmu_unthrottle(struct perf_event *event)
+armpmu_start(struct perf_event *event, int flags)
  {
         struct hw_perf_event *hwc = &event->hw;
  
+       if (!armpmu)
+               return;
+
+       /*
+        * ARM pmu always has to reprogram the period, so ignore
+        * PERF_EF_RELOAD, see the comment below.
+        */
+       if (flags & PERF_EF_RELOAD)
+               WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+
+       hwc->state = 0;
         /*
          * Set the period again. Some counters can't be stopped, so when we
-        * were throttled we simply disabled the IRQ source and the counter
+        * were stopped we simply disabled the IRQ source and the counter
          * may have been left counting. If we don't do this step then we may
          * get an interrupt too soon or *way* too late if the overflow has
          * happened since disabling.
@@ -269,14 +279,33 @@ armpmu_unthrottle(struct perf_event *event)
         armpmu->enable(hwc, hwc->idx);
  }
  
+static void
+armpmu_del(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       int idx = hwc->idx;
+
+       WARN_ON(idx < 0);
+
+       clear_bit(idx, cpuc->active_mask);
+       armpmu_stop(event, PERF_EF_UPDATE);
+       cpuc->events[idx] = NULL;
+       clear_bit(idx, cpuc->used_mask);
+
+       perf_event_update_userpage(event);
+}
+
  static int
-armpmu_enable(struct perf_event *event)
+armpmu_add(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
         int idx;
         int err = 0;
  
+       perf_pmu_disable(event->pmu);
+
         /* If we don't have a space for the counter then finish early. */
         idx = armpmu->get_event_idx(cpuc, hwc);
         if (idx < 0) {
@@ -293,25 +322,19 @@ armpmu_enable(struct perf_event *event)
         cpuc->events[idx] = event;
         set_bit(idx, cpuc->active_mask);
  
-       /* Set the period for the event. */
-       armpmu_event_set_period(event, hwc, idx);
-
-       /* Enable the event. */
-       armpmu->enable(hwc, idx);
+       hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+       if (flags & PERF_EF_START)
+               armpmu_start(event, PERF_EF_RELOAD);
  
         /* Propagate our changes to the userspace mapping. */
         perf_event_update_userpage(event);
  
  out:
+       perf_pmu_enable(event->pmu);
         return err;
  }
  
-static struct pmu pmu = {
-       .enable     = armpmu_enable,
-       .disable    = armpmu_disable,
-       .unthrottle = armpmu_unthrottle,
-       .read       = armpmu_read,
-};
+static struct pmu pmu;
  
  static int
  validate_event(struct cpu_hw_events *cpuc,
@@ -491,20 +514,29 @@ __hw_perf_event_init(struct perf_event *event)
         return err;
  }
  
-const struct pmu *
-hw_perf_event_init(struct perf_event *event)
+static int armpmu_event_init(struct perf_event *event)
  {
         int err = 0;
  
+       switch (event->attr.type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HARDWARE:
+       case PERF_TYPE_HW_CACHE:
+               break;
+
+       default:
+               return -ENOENT;
+       }
+
         if (!armpmu)
-               return ERR_PTR(-ENODEV);
+               return -ENODEV;
  
         event->destroy = hw_perf_event_destroy;
  
         if (!atomic_inc_not_zero(&active_events)) {
-               if (atomic_read(&active_events) > perf_max_events) {
+               if (atomic_read(&active_events) > armpmu->num_events) {
                         atomic_dec(&active_events);
-                       return ERR_PTR(-ENOSPC);
+                       return -ENOSPC;
                 }
  
                 mutex_lock(&pmu_reserve_mutex);
@@ -518,17 +550,16 @@ hw_perf_event_init(struct perf_event *event)
         }
  
         if (err)
-               return ERR_PTR(err);
+               return err;
  
         err = __hw_perf_event_init(event);
         if (err)
                 hw_perf_event_destroy(event);
  
-       return err ? ERR_PTR(err) : &pmu;
+       return err;
  }
  
-void
-hw_perf_enable(void)
+static void armpmu_enable(struct pmu *pmu)
  {
         /* Enable all of the perf events on hardware. */
         int idx;
@@ -549,13 +580,23 @@ hw_perf_enable(void)
         armpmu->start();
  }
  
-void
-hw_perf_disable(void)
+static void armpmu_disable(struct pmu *pmu)
  {
         if (armpmu)
                 armpmu->stop();
  }
  
+static struct pmu pmu = {
+       .pmu_enable     = armpmu_enable,
+       .pmu_disable    = armpmu_disable,
+       .event_init     = armpmu_event_init,
+       .add            = armpmu_add,
+       .del            = armpmu_del,
+       .start          = armpmu_start,
+       .stop           = armpmu_stop,
+       .read           = armpmu_read,
+};
+
  /*
   * ARMv6 Performance counter handling code.
   *
@@ -2933,14 +2974,12 @@ init_hw_perf_events(void)
                         armpmu = &armv6pmu;
                         memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
                                         sizeof(armv6_perf_cache_map));
-                       perf_max_events = armv6pmu.num_events;
                         break;
                 case 0xB020:    /* ARM11mpcore */
                         armpmu = &armv6mpcore_pmu;
                         memcpy(armpmu_perf_cache_map,
                                armv6mpcore_perf_cache_map,
                                sizeof(armv6mpcore_perf_cache_map));
-                       perf_max_events = armv6mpcore_pmu.num_events;
                         break;
                 case 0xC080:    /* Cortex-A8 */
                         armv7pmu.id = ARM_PERF_PMU_ID_CA8;
@@ -2952,7 +2991,6 @@ init_hw_perf_events(void)
                         /* Reset PMNC and read the nb of CNTx counters
                             supported */
                         armv7pmu.num_events = armv7_reset_read_pmnc();
-                       perf_max_events = armv7pmu.num_events;
                         break;
                 case 0xC090:    /* Cortex-A9 */
                         armv7pmu.id = ARM_PERF_PMU_ID_CA9;
@@ -2964,7 +3002,6 @@ init_hw_perf_events(void)
                         /* Reset PMNC and read the nb of CNTx counters
                             supported */
                         armv7pmu.num_events = armv7_reset_read_pmnc();
-                       perf_max_events = armv7pmu.num_events;
                         break;
                 }
         /* Intel CPUs [xscale]. */
@@ -2975,13 +3012,11 @@ init_hw_perf_events(void)
                         armpmu = &xscale1pmu;
                         memcpy(armpmu_perf_cache_map, xscale_perf_cache_map,
                                         sizeof(xscale_perf_cache_map));
-                       perf_max_events = xscale1pmu.num_events;
                         break;
                 case 2:
                         armpmu = &xscale2pmu;
                         memcpy(armpmu_perf_cache_map, xscale_perf_cache_map,
                                         sizeof(xscale_perf_cache_map));
-                       perf_max_events = xscale2pmu.num_events;
                         break;
                 }
         }
@@ -2991,9 +3026,10 @@ init_hw_perf_events(void)
                                 arm_pmu_names[armpmu->id], armpmu->num_events);
         } else {
                 pr_info("no hardware support available\n");
-               perf_max_events = -1;
         }
  
+       perf_pmu_register(&pmu);
+
         return 0;
  }
  arch_initcall(init_hw_perf_events);
@@ -3001,13 +3037,6 @@ arch_initcall(init_hw_perf_events);
  /*
   * Callchain handling code.
   */
-static inline void
-callchain_store(struct perf_callchain_entry *entry,
-               u64 ip)
-{
-       if (entry->nr < PERF_MAX_STACK_DEPTH)
-               entry->ip[entry->nr++] = ip;
-}
  
  /*
   * The registers we're interested in are at the end of the variable
@@ -3039,7 +3068,7 @@ user_backtrace(struct frame_tail *tail,
         if (__copy_from_user_inatomic(&buftail, tail, sizeof(buftail)))
                 return NULL;
  
-       callchain_store(entry, buftail.lr);
+       perf_callchain_store(entry, buftail.lr);
  
         /*
          * Frame pointers should strictly progress back up the stack
@@ -3051,16 +3080,11 @@ user_backtrace(struct frame_tail *tail,
         return buftail.fp - 1;
  }
  
-static void
-perf_callchain_user(struct pt_regs *regs,
-                   struct perf_callchain_entry *entry)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
         struct frame_tail *tail;
  
-       callchain_store(entry, PERF_CONTEXT_USER);
-
-       if (!user_mode(regs))
-               regs = task_pt_regs(current);
  
         tail = (struct frame_tail *)regs->ARM_fp - 1;
  
@@ -3078,56 +3102,18 @@ callchain_trace(struct stackframe *fr,
                 void *data)
  {
         struct perf_callchain_entry *entry = data;
-       callchain_store(entry, fr->pc);
+       perf_callchain_store(entry, fr->pc);
         return 0;
  }
  
-static void
-perf_callchain_kernel(struct pt_regs *regs,
-                     struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
         struct stackframe fr;
  
-       callchain_store(entry, PERF_CONTEXT_KERNEL);
         fr.fp = regs->ARM_fp;
         fr.sp = regs->ARM_sp;
         fr.lr = regs->ARM_lr;
         fr.pc = regs->ARM_pc;
         walk_stackframe(&fr, callchain_trace, entry);
  }
-
-static void
-perf_do_callchain(struct pt_regs *regs,
-                 struct perf_callchain_entry *entry)
-{
-       int is_user;
-
-       if (!regs)
-               return;
-
-       is_user = user_mode(regs);
-
-       if (!current || !current->pid)
-               return;
-
-       if (is_user && current->state != TASK_RUNNING)
-               return;
-
-       if (!is_user)
-               perf_callchain_kernel(regs, entry);
-
-       if (current->mm)
-               perf_callchain_user(regs, entry);
-}
-
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-
-struct perf_callchain_entry *
-perf_callchain(struct pt_regs *regs)
-{
-       struct perf_callchain_entry *entry = &__get_cpu_var(pmc_irq_entry);
-
-       entry->nr = 0;
-       perf_do_callchain(regs, entry);
-       return entry;
-}
diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c
index 0691176899ffc24f0a176d154a34f5b7200c6047..aad63e611b36a90bd156c8e1a0caed99cd2182be 100644
--- a/arch/arm/oprofile/common.c
+++ b/arch/arm/oprofile/common.c
@@ -96,7 +96,7 @@ static int op_create_counter(int cpu, int event)
                 return ret;
  
         pevent = perf_event_create_kernel_counter(&counter_config[event].attr,
-                                                 cpu, -1,
+                                                 cpu, NULL,
                                                   op_overflow_handler);
  
         if (IS_ERR(pevent)) {
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c
index 95ad9dad298e9d4773117b0406bc4a3378d77e5e..d05ae4204bbf3d3ddcc84266476b736a6790715c 100644
--- a/arch/powerpc/kernel/perf_callchain.c
+++ b/arch/powerpc/kernel/perf_callchain.c
@@ -23,18 +23,6 @@
  #include "ppc32.h"
  #endif
  
-/*
- * Store another value in a callchain_entry.
- */
-static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-       unsigned int nr = entry->nr;
-
-       if (nr < PERF_MAX_STACK_DEPTH) {
-               entry->ip[nr] = ip;
-               entry->nr = nr + 1;
-       }
-}
  
  /*
   * Is sp valid as the address of the next kernel stack frame after prev_sp?
@@ -58,8 +46,8 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
         return 0;
  }
  
-static void perf_callchain_kernel(struct pt_regs *regs,
-                                 struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
         unsigned long sp, next_sp;
         unsigned long next_ip;
@@ -69,8 +57,7 @@ static void perf_callchain_kernel(struct pt_regs *regs,
  
         lr = regs->link;
         sp = regs->gpr[1];
-       callchain_store(entry, PERF_CONTEXT_KERNEL);
-       callchain_store(entry, regs->nip);
+       perf_callchain_store(entry, regs->nip);
  
         if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
                 return;
@@ -89,7 +76,7 @@ static void perf_callchain_kernel(struct pt_regs *regs,
                         next_ip = regs->nip;
                         lr = regs->link;
                         level = 0;
-                       callchain_store(entry, PERF_CONTEXT_KERNEL);
+                       perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
  
                 } else {
                         if (level == 0)
@@ -111,7 +98,7 @@ static void perf_callchain_kernel(struct pt_regs *regs,
                         ++level;
                 }
  
-               callchain_store(entry, next_ip);
+               perf_callchain_store(entry, next_ip);
                 if (!valid_next_sp(next_sp, sp))
                         return;
                 sp = next_sp;
@@ -233,8 +220,8 @@ static int sane_signal_64_frame(unsigned long sp)
                 puc == (unsigned long) &sf->uc;
  }
  
-static void perf_callchain_user_64(struct pt_regs *regs,
-                                  struct perf_callchain_entry *entry)
+static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+                                  struct pt_regs *regs)
  {
         unsigned long sp, next_sp;
         unsigned long next_ip;
@@ -246,8 +233,7 @@ static void perf_callchain_user_64(struct pt_regs *regs,
         next_ip = regs->nip;
         lr = regs->link;
         sp = regs->gpr[1];
-       callchain_store(entry, PERF_CONTEXT_USER);
-       callchain_store(entry, next_ip);
+       perf_callchain_store(entry, next_ip);
  
         for (;;) {
                 fp = (unsigned long __user *) sp;
@@ -276,14 +262,14 @@ static void perf_callchain_user_64(struct pt_regs *regs,
                             read_user_stack_64(&uregs[PT_R1], &sp))
                                 return;
                         level = 0;
-                       callchain_store(entry, PERF_CONTEXT_USER);
-                       callchain_store(entry, next_ip);
+                       perf_callchain_store(entry, PERF_CONTEXT_USER);
+                       perf_callchain_store(entry, next_ip);
                         continue;
                 }
  
                 if (level == 0)
                         next_ip = lr;
-               callchain_store(entry, next_ip);
+               perf_callchain_store(entry, next_ip);
                 ++level;
                 sp = next_sp;
         }
@@ -315,8 +301,8 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
         return __get_user_inatomic(*ret, ptr);
  }
  
-static inline void perf_callchain_user_64(struct pt_regs *regs,
-                                         struct perf_callchain_entry *entry)
+static inline void perf_callchain_user_64(struct perf_callchain_entry *entry,
+                                         struct pt_regs *regs)
  {
  }
  
@@ -435,8 +421,8 @@ static unsigned int __user *signal_frame_32_regs(unsigned int sp,
         return mctx->mc_gregs;
  }
  
-static void perf_callchain_user_32(struct pt_regs *regs,
-                                  struct perf_callchain_entry *entry)
+static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+                                  struct pt_regs *regs)
  {
         unsigned int sp, next_sp;
         unsigned int next_ip;
@@ -447,8 +433,7 @@ static void perf_callchain_user_32(struct pt_regs *regs,
         next_ip = regs->nip;
         lr = regs->link;
         sp = regs->gpr[1];
-       callchain_store(entry, PERF_CONTEXT_USER);
-       callchain_store(entry, next_ip);
+       perf_callchain_store(entry, next_ip);
  
         while (entry->nr < PERF_MAX_STACK_DEPTH) {
                 fp = (unsigned int __user *) (unsigned long) sp;
@@ -470,45 +455,24 @@ static void perf_callchain_user_32(struct pt_regs *regs,
                             read_user_stack_32(&uregs[PT_R1], &sp))
                                 return;
                         level = 0;
-                       callchain_store(entry, PERF_CONTEXT_USER);
-                       callchain_store(entry, next_ip);
+                       perf_callchain_store(entry, PERF_CONTEXT_USER);
+                       perf_callchain_store(entry, next_ip);
                         continue;
                 }
  
                 if (level == 0)
                         next_ip = lr;
-               callchain_store(entry, next_ip);
+               perf_callchain_store(entry, next_ip);
                 ++level;
                 sp = next_sp;
         }
  }
  
-/*
- * Since we can't get PMU interrupts inside a PMU interrupt handler,
- * we don't need separate irq and nmi entries here.
- */
-static DEFINE_PER_CPU(struct perf_callchain_entry, cpu_perf_callchain);
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
-       struct perf_callchain_entry *entry = &__get_cpu_var(cpu_perf_callchain);
-
-       entry->nr = 0;
-
-       if (!user_mode(regs)) {
-               perf_callchain_kernel(regs, entry);
-               if (current->mm)
-                       regs = task_pt_regs(current);
-               else
-                       regs = NULL;
-       }
-
-       if (regs) {
-               if (current_is_64bit())
-                       perf_callchain_user_64(regs, entry);
-               else
-                       perf_callchain_user_32(regs, entry);
-       }
-
-       return entry;
+       if (current_is_64bit())
+               perf_callchain_user_64(entry, regs);
+       else
+               perf_callchain_user_32(entry, regs);
  }
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index d301a30445e09a49cec4a3d4dcf2ea01529934b3..9cb4924b6c07837a47718bb1c842eb40a96df7b8 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -402,6 +402,9 @@ static void power_pmu_read(struct perf_event *event)
  {
         s64 val, delta, prev;
  
+       if (event->hw.state & PERF_HES_STOPPED)
+               return;
+
         if (!event->hw.idx)
                 return;
         /*
@@ -517,7 +520,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
   * Disable all events to prevent PMU interrupts and to allow
   * events to be added or removed.
   */
-void hw_perf_disable(void)
+static void power_pmu_disable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw;
         unsigned long flags;
@@ -565,7 +568,7 @@ void hw_perf_disable(void)
   * If we were previously disabled and events were added, then
   * put the new config on the PMU.
   */
-void hw_perf_enable(void)
+static void power_pmu_enable(struct pmu *pmu)
  {
         struct perf_event *event;
         struct cpu_hw_events *cpuhw;
@@ -672,6 +675,8 @@ void hw_perf_enable(void)
                 }
                 local64_set(&event->hw.prev_count, val);
                 event->hw.idx = idx;
+               if (event->hw.state & PERF_HES_STOPPED)
+                       val = 0;
                 write_pmc(idx, val);
                 perf_event_update_userpage(event);
         }
@@ -727,7 +732,7 @@ static int collect_events(struct perf_event *group, int max_count,
   * re-enable the PMU in order to get hw_perf_enable to do the
   * actual work of reconfiguring the PMU.
   */
-static int power_pmu_enable(struct perf_event *event)
+static int power_pmu_add(struct perf_event *event, int ef_flags)
  {
         struct cpu_hw_events *cpuhw;
         unsigned long flags;
@@ -735,7 +740,7 @@ static int power_pmu_enable(struct perf_event *event)
         int ret = -EAGAIN;
  
         local_irq_save(flags);
-       perf_disable();
+       perf_pmu_disable(event->pmu);
  
         /*
          * Add the event to the list (if there is room)
@@ -749,6 +754,9 @@ static int power_pmu_enable(struct perf_event *event)
         cpuhw->events[n0] = event->hw.config;
         cpuhw->flags[n0] = event->hw.event_base;
  
+       if (!(ef_flags & PERF_EF_START))
+               event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
         /*
          * If group events scheduling transaction was started,
          * skip the schedulability test here, it will be peformed
@@ -769,7 +777,7 @@ nocheck:
  
         ret = 0;
   out:
-       perf_enable();
+       perf_pmu_enable(event->pmu);
         local_irq_restore(flags);
         return ret;
  }
@@ -777,14 +785,14 @@ nocheck:
  /*
   * Remove a event from the PMU.
   */
-static void power_pmu_disable(struct perf_event *event)
+static void power_pmu_del(struct perf_event *event, int ef_flags)
  {
         struct cpu_hw_events *cpuhw;
         long i;
         unsigned long flags;
  
         local_irq_save(flags);
-       perf_disable();
+       perf_pmu_disable(event->pmu);
  
         power_pmu_read(event);
  
@@ -821,34 +829,60 @@ static void power_pmu_disable(struct perf_event *event)
                 cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
         }
  
-       perf_enable();
+       perf_pmu_enable(event->pmu);
         local_irq_restore(flags);
  }
  
  /*
- * Re-enable interrupts on a event after they were throttled
- * because they were coming too fast.
+ * POWER-PMU does not support disabling individual counters, hence
+ * program their cycle counter to their max value and ignore the interrupts.
   */
-static void power_pmu_unthrottle(struct perf_event *event)
+
+static void power_pmu_start(struct perf_event *event, int ef_flags)
+{
+       unsigned long flags;
+       s64 left;
+
+       if (!event->hw.idx || !event->hw.sample_period)
+               return;
+
+       if (!(event->hw.state & PERF_HES_STOPPED))
+               return;
+
+       if (ef_flags & PERF_EF_RELOAD)
+               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
+       local_irq_save(flags);
+       perf_pmu_disable(event->pmu);
+
+       event->hw.state = 0;
+       left = local64_read(&event->hw.period_left);
+       write_pmc(event->hw.idx, left);
+
+       perf_event_update_userpage(event);
+       perf_pmu_enable(event->pmu);
+       local_irq_restore(flags);
+}
+
+static void power_pmu_stop(struct perf_event *event, int ef_flags)
  {
-       s64 val, left;
         unsigned long flags;
  
         if (!event->hw.idx || !event->hw.sample_period)
                 return;
+
+       if (event->hw.state & PERF_HES_STOPPED)
+               return;
+
         local_irq_save(flags);
-       perf_disable();
+       perf_pmu_disable(event->pmu);
+
         power_pmu_read(event);
-       left = event->hw.sample_period;
-       event->hw.last_period = left;
-       val = 0;
-       if (left < 0x80000000L)
-               val = 0x80000000L - left;
-       write_pmc(event->hw.idx, val);
-       local64_set(&event->hw.prev_count, val);
-       local64_set(&event->hw.period_left, left);
+       event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+       write_pmc(event->hw.idx, 0);
+
         perf_event_update_userpage(event);
-       perf_enable();
+       perf_pmu_enable(event->pmu);
         local_irq_restore(flags);
  }
  
@@ -857,10 +891,11 @@ static void power_pmu_unthrottle(struct perf_event *event)
   * Set the flag to make pmu::enable() not perform the
   * schedulability test, it will be performed at commit time
   */
-void power_pmu_start_txn(const struct pmu *pmu)
+void power_pmu_start_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
  
+       perf_pmu_disable(pmu);
         cpuhw->group_flag |= PERF_EVENT_TXN;
         cpuhw->n_txn_start = cpuhw->n_events;
  }
@@ -870,11 +905,12 @@ void power_pmu_start_txn(const struct pmu *pmu)
   * Clear the flag and pmu::enable() will perform the
   * schedulability test.
   */
-void power_pmu_cancel_txn(const struct pmu *pmu)
+void power_pmu_cancel_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
  
         cpuhw->group_flag &= ~PERF_EVENT_TXN;
+       perf_pmu_enable(pmu);
  }
  
  /*
@@ -882,7 +918,7 @@ void power_pmu_cancel_txn(const struct pmu *pmu)
   * Perform the group schedulability test as a whole
   * Return 0 if success
   */
-int power_pmu_commit_txn(const struct pmu *pmu)
+int power_pmu_commit_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw;
         long i, n;
@@ -901,19 +937,10 @@ int power_pmu_commit_txn(const struct pmu *pmu)
                 cpuhw->event[i]->hw.config = cpuhw->events[i];
  
         cpuhw->group_flag &= ~PERF_EVENT_TXN;
+       perf_pmu_enable(pmu);
         return 0;
  }
  
-struct pmu power_pmu = {
-       .enable         = power_pmu_enable,
-       .disable        = power_pmu_disable,
-       .read           = power_pmu_read,
-       .unthrottle     = power_pmu_unthrottle,
-       .start_txn      = power_pmu_start_txn,
-       .cancel_txn     = power_pmu_cancel_txn,
-       .commit_txn     = power_pmu_commit_txn,
-};
-
  /*
   * Return 1 if we might be able to put event on a limited PMC,
   * or 0 if not.
@@ -1014,7 +1041,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp)
         return 0;
  }
  
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int power_pmu_event_init(struct perf_event *event)
  {
         u64 ev;
         unsigned long flags;
@@ -1026,25 +1053,27 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
         struct cpu_hw_events *cpuhw;
  
         if (!ppmu)
-               return ERR_PTR(-ENXIO);
+               return -ENOENT;
+
         switch (event->attr.type) {
         case PERF_TYPE_HARDWARE:
                 ev = event->attr.config;
                 if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
-                       return ERR_PTR(-EOPNOTSUPP);
+                       return -EOPNOTSUPP;
                 ev = ppmu->generic_events[ev];
                 break;
         case PERF_TYPE_HW_CACHE:
                 err = hw_perf_cache_event(event->attr.config, &ev);
                 if (err)
-                       return ERR_PTR(err);
+                       return err;
                 break;
         case PERF_TYPE_RAW:
                 ev = event->attr.config;
                 break;
         default:
-               return ERR_PTR(-EINVAL);
+               return -ENOENT;
         }
+
         event->hw.config_base = ev;
         event->hw.idx = 0;
  
@@ -1081,7 +1110,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
                          */
                         ev = normal_pmc_alternative(ev, flags);
                         if (!ev)
-                               return ERR_PTR(-EINVAL);
+                               return -EINVAL;
                 }
         }
  
@@ -1095,19 +1124,19 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
                 n = collect_events(event->group_leader, ppmu->n_counter - 1,
                                    ctrs, events, cflags);
                 if (n < 0)
-                       return ERR_PTR(-EINVAL);
+                       return -EINVAL;
         }
         events[n] = ev;
         ctrs[n] = event;
         cflags[n] = flags;
         if (check_excludes(ctrs, cflags, n, 1))
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
  
         cpuhw = &get_cpu_var(cpu_hw_events);
         err = power_check_constraints(cpuhw, events, cflags, n + 1);
         put_cpu_var(cpu_hw_events);
         if (err)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
  
         event->hw.config = events[n];
         event->hw.event_base = cflags[n];
@@ -1132,11 +1161,23 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
         }
         event->destroy = hw_perf_event_destroy;
  
-       if (err)
-               return ERR_PTR(err);
-       return &power_pmu;
+       return err;
  }
  
+struct pmu power_pmu = {
+       .pmu_enable     = power_pmu_enable,
+       .pmu_disable    = power_pmu_disable,
+       .event_init     = power_pmu_event_init,
+       .add            = power_pmu_add,
+       .del            = power_pmu_del,
+       .start          = power_pmu_start,
+       .stop           = power_pmu_stop,
+       .read           = power_pmu_read,
+       .start_txn      = power_pmu_start_txn,
+       .cancel_txn     = power_pmu_cancel_txn,
+       .commit_txn     = power_pmu_commit_txn,
+};
+
  /*
   * A counter has overflowed; update its count and record
   * things if requested.  Note that interrupts are hard-disabled
@@ -1149,6 +1190,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
         s64 prev, delta, left;
         int record = 0;
  
+       if (event->hw.state & PERF_HES_STOPPED) {
+               write_pmc(event->hw.idx, 0);
+               return;
+       }
+
         /* we don't have to worry about interrupts here */
         prev = local64_read(&event->hw.prev_count);
         delta = (val - prev) & 0xfffffffful;
@@ -1171,6 +1217,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
                         val = 0x80000000LL - left;
         }
  
+       write_pmc(event->hw.idx, val);
+       local64_set(&event->hw.prev_count, val);
+       local64_set(&event->hw.period_left, left);
+       perf_event_update_userpage(event);
+
         /*
          * Finally record data if requested.
          */
@@ -1183,23 +1234,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
                 if (event->attr.sample_type & PERF_SAMPLE_ADDR)
                         perf_get_data_addr(regs, &data.addr);
  
-               if (perf_event_overflow(event, nmi, &data, regs)) {
-                       /*
-                        * Interrupts are coming too fast - throttle them
-                        * by setting the event to 0, so it will be
-                        * at least 2^30 cycles until the next interrupt
-                        * (assuming each event counts at most 2 counts
-                        * per cycle).
-                        */
-                       val = 0;
-                       left = ~0ULL >> 1;
-               }
+               if (perf_event_overflow(event, nmi, &data, regs))
+                       power_pmu_stop(event, 0);
         }
-
-       write_pmc(event->hw.idx, val);
-       local64_set(&event->hw.prev_count, val);
-       local64_set(&event->hw.period_left, left);
-       perf_event_update_userpage(event);
  }
  
  /*
@@ -1342,6 +1379,7 @@ int register_power_pmu(struct power_pmu *pmu)
                 freeze_events_kernel = MMCR0_FCHV;
  #endif /* CONFIG_PPC64 */
  
+       perf_pmu_register(&power_pmu);
         perf_cpu_notifier(power_pmu_notifier);
  
         return 0;
diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c

index 1ba45471ae436617e1ecbf3654a5064ef15d1af7..7ecca59ddf77fe20bd46b470d9392cdd16fd5ba9 100644 (file)
--- a/arch/powerpc/kernel/perf_event_fsl_emb.c
+++ b/arch/powerpc/kernel/perf_event_fsl_emb.c
@@ -156,6 +156,9 @@ static void fsl_emb_pmu_read(struct perf_event *event)
  {
         s64 val, delta, prev;
  
+       if (event->hw.state & PERF_HES_STOPPED)
+               return;
+
         /*
          * Performance monitor interrupts come even when interrupts
          * are soft-disabled, as long as interrupts are hard-enabled.
@@ -177,7 +180,7 @@ static void fsl_emb_pmu_read(struct perf_event *event)
   * Disable all events to prevent PMU interrupts and to allow
   * events to be added or removed.
   */
-void hw_perf_disable(void)
+static void fsl_emb_pmu_disable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw;
         unsigned long flags;
@@ -216,7 +219,7 @@ void hw_perf_disable(void)
   * If we were previously disabled and events were added, then
   * put the new config on the PMU.
   */
-void hw_perf_enable(void)
+static void fsl_emb_pmu_enable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw;
         unsigned long flags;
@@ -262,8 +265,8 @@ static int collect_events(struct perf_event *group, int max_count,
         return n;
  }
  
-/* perf must be disabled, context locked on entry */
-static int fsl_emb_pmu_enable(struct perf_event *event)
+/* context locked on entry */
+static int fsl_emb_pmu_add(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuhw;
         int ret = -EAGAIN;
@@ -271,6 +274,7 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
         u64 val;
         int i;
  
+       perf_pmu_disable(event->pmu);
         cpuhw = &get_cpu_var(cpu_hw_events);
  
         if (event->hw.config & FSL_EMB_EVENT_RESTRICTED)
@@ -301,6 +305,12 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
                         val = 0x80000000L - left;
         }
         local64_set(&event->hw.prev_count, val);
+
+       if (!(flags & PERF_EF_START)) {
+               event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+               val = 0;
+       }
+
         write_pmc(i, val);
         perf_event_update_userpage(event);
  
@@ -310,15 +320,17 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
         ret = 0;
   out:
         put_cpu_var(cpu_hw_events);
+       perf_pmu_enable(event->pmu);
         return ret;
  }
  
-/* perf must be disabled, context locked on entry */
-static void fsl_emb_pmu_disable(struct perf_event *event)
+/* context locked on entry */
+static void fsl_emb_pmu_del(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuhw;
         int i = event->hw.idx;
  
+       perf_pmu_disable(event->pmu);
         if (i < 0)
                 goto out;
  
@@ -346,44 +358,57 @@ static void fsl_emb_pmu_disable(struct perf_event *event)
         cpuhw->n_events--;
  
   out:
+       perf_pmu_enable(event->pmu);
         put_cpu_var(cpu_hw_events);
  }
  
-/*
- * Re-enable interrupts on a event after they were throttled
- * because they were coming too fast.
- *
- * Context is locked on entry, but perf is not disabled.
- */
-static void fsl_emb_pmu_unthrottle(struct perf_event *event)
+static void fsl_emb_pmu_start(struct perf_event *event, int ef_flags)
  {
-       s64 val, left;
         unsigned long flags;
+       s64 left;
  
         if (event->hw.idx < 0 || !event->hw.sample_period)
                 return;
+
+       if (!(event->hw.state & PERF_HES_STOPPED))
+               return;
+
+       if (ef_flags & PERF_EF_RELOAD)
+               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
         local_irq_save(flags);
-       perf_disable();
-       fsl_emb_pmu_read(event);
-       left = event->hw.sample_period;
-       event->hw.last_period = left;
-       val = 0;
-       if (left < 0x80000000L)
-               val = 0x80000000L - left;
-       write_pmc(event->hw.idx, val);
-       local64_set(&event->hw.prev_count, val);
-       local64_set(&event->hw.period_left, left);
+       perf_pmu_disable(event->pmu);
+
+       event->hw.state = 0;
+       left = local64_read(&event->hw.period_left);
+       write_pmc(event->hw.idx, left);
+
         perf_event_update_userpage(event);
-       perf_enable();
+       perf_pmu_enable(event->pmu);
         local_irq_restore(flags);
  }
  
-static struct pmu fsl_emb_pmu = {
-       .enable         = fsl_emb_pmu_enable,
-       .disable        = fsl_emb_pmu_disable,
-       .read           = fsl_emb_pmu_read,
-       .unthrottle     = fsl_emb_pmu_unthrottle,
-};
+static void fsl_emb_pmu_stop(struct perf_event *event, int ef_flags)
+{
+       unsigned long flags;
+
+       if (event->hw.idx < 0 || !event->hw.sample_period)
+               return;
+
+       if (event->hw.state & PERF_HES_STOPPED)
+               return;
+
+       local_irq_save(flags);
+       perf_pmu_disable(event->pmu);
+
+       fsl_emb_pmu_read(event);
+       event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+       write_pmc(event->hw.idx, 0);
+
+       perf_event_update_userpage(event);
+       perf_pmu_enable(event->pmu);
+       local_irq_restore(flags);
+}
  
  /*
   * Release the PMU if this is the last perf_event.
@@ -428,7 +453,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp)
         return 0;
  }
  
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int fsl_emb_pmu_event_init(struct perf_event *event)
  {
         u64 ev;
         struct perf_event *events[MAX_HWEVENTS];
@@ -441,14 +466,14 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
         case PERF_TYPE_HARDWARE:
                 ev = event->attr.config;
                 if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
-                       return ERR_PTR(-EOPNOTSUPP);
+                       return -EOPNOTSUPP;
                 ev = ppmu->generic_events[ev];
                 break;
  
         case PERF_TYPE_HW_CACHE:
                 err = hw_perf_cache_event(event->attr.config, &ev);
                 if (err)
-                       return ERR_PTR(err);
+                       return err;
                 break;
  
         case PERF_TYPE_RAW:
@@ -456,12 +481,12 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
                 break;
  
         default:
-               return ERR_PTR(-EINVAL);
+               return -ENOENT;
         }
  
         event->hw.config = ppmu->xlate_event(ev);
         if (!(event->hw.config & FSL_EMB_EVENT_VALID))
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
  
         /*
          * If this is in a group, check if it can go on with all the
@@ -473,7 +498,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
                 n = collect_events(event->group_leader,
                                    ppmu->n_counter - 1, events);
                 if (n < 0)
-                       return ERR_PTR(-EINVAL);
+                       return -EINVAL;
         }
  
         if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) {
@@ -484,7 +509,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
                 }
  
                 if (num_restricted >= ppmu->n_restricted)
-                       return ERR_PTR(-EINVAL);
+                       return -EINVAL;
         }
  
         event->hw.idx = -1;
@@ -497,7 +522,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
         if (event->attr.exclude_kernel)
                 event->hw.config_base |= PMLCA_FCS;
         if (event->attr.exclude_idle)
-               return ERR_PTR(-ENOTSUPP);
+               return -ENOTSUPP;
  
         event->hw.last_period = event->hw.sample_period;
         local64_set(&event->hw.period_left, event->hw.last_period);
@@ -523,11 +548,20 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
         }
         event->destroy = hw_perf_event_destroy;
  
-       if (err)
-               return ERR_PTR(err);
-       return &fsl_emb_pmu;
+       return err;
  }
  
+static struct pmu fsl_emb_pmu = {
+       .pmu_enable     = fsl_emb_pmu_enable,
+       .pmu_disable    = fsl_emb_pmu_disable,
+       .event_init     = fsl_emb_pmu_event_init,
+       .add            = fsl_emb_pmu_add,
+       .del            = fsl_emb_pmu_del,
+       .start          = fsl_emb_pmu_start,
+       .stop           = fsl_emb_pmu_stop,
+       .read           = fsl_emb_pmu_read,
+};
+
  /*
   * A counter has overflowed; update its count and record
   * things if requested.  Note that interrupts are hard-disabled
@@ -540,6 +574,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
         s64 prev, delta, left;
         int record = 0;
  
+       if (event->hw.state & PERF_HES_STOPPED) {
+               write_pmc(event->hw.idx, 0);
+               return;
+       }
+
         /* we don't have to worry about interrupts here */
         prev = local64_read(&event->hw.prev_count);
         delta = (val - prev) & 0xfffffffful;
@@ -562,6 +601,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
                         val = 0x80000000LL - left;
         }
  
+       write_pmc(event->hw.idx, val);
+       local64_set(&event->hw.prev_count, val);
+       local64_set(&event->hw.period_left, left);
+       perf_event_update_userpage(event);
+
         /*
          * Finally record data if requested.
          */
@@ -571,23 +615,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
                 perf_sample_data_init(&data, 0);
                 data.period = event->hw.last_period;
  
-               if (perf_event_overflow(event, nmi, &data, regs)) {
-                       /*
-                        * Interrupts are coming too fast - throttle them
-                        * by setting the event to 0, so it will be
-                        * at least 2^30 cycles until the next interrupt
-                        * (assuming each event counts at most 2 counts
-                        * per cycle).
-                        */
-                       val = 0;
-                       left = ~0ULL >> 1;
-               }
+               if (perf_event_overflow(event, nmi, &data, regs))
+                       fsl_emb_pmu_stop(event, 0);
         }
-
-       write_pmc(event->hw.idx, val);
-       local64_set(&event->hw.prev_count, val);
-       local64_set(&event->hw.period_left, left);
-       perf_event_update_userpage(event);
  }
  
  static void perf_event_interrupt(struct pt_regs *regs)
@@ -651,5 +681,7 @@ int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu)
         pr_info("%s performance monitor hardware support registered\n",
                 pmu->name);
  
+       perf_pmu_register(&fsl_emb_pmu);
+
         return 0;
  }
diff --git a/arch/sh/kernel/perf_callchain.c b/arch/sh/kernel/perf_callchain.c

index a9dd3abde28e3f45bbd7d7654e8717c13aed8f34..d5ca1ef50fa9694a1942a8c304bd2c5aa69d9388 100644 (file)
--- a/arch/sh/kernel/perf_callchain.c
+++ b/arch/sh/kernel/perf_callchain.c
@@ -14,11 +14,6 @@
  #include <asm/unwinder.h>
  #include <asm/ptrace.h>
  
-static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-       if (entry->nr < PERF_MAX_STACK_DEPTH)
-               entry->ip[entry->nr++] = ip;
-}
  
  static void callchain_warning(void *data, char *msg)
  {
@@ -39,7 +34,7 @@ static void callchain_address(void *data, unsigned long addr, int reliable)
         struct perf_callchain_entry *entry = data;
  
         if (reliable)
-               callchain_store(entry, addr);
+               perf_callchain_store(entry, addr);
  }
  
  static const struct stacktrace_ops callchain_ops = {
@@ -49,47 +44,10 @@ static const struct stacktrace_ops callchain_ops = {
         .address        = callchain_address,
  };
  
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
-       callchain_store(entry, PERF_CONTEXT_KERNEL);
-       callchain_store(entry, regs->pc);
+       perf_callchain_store(entry, regs->pc);
  
         unwind_stack(NULL, regs, NULL, &callchain_ops, entry);
  }
-
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-       int is_user;
-
-       if (!regs)
-               return;
-
-       is_user = user_mode(regs);
-
-       if (is_user && current->state != TASK_RUNNING)
-               return;
-
-       /*
-        * Only the kernel side is implemented for now.
-        */
-       if (!is_user)
-               perf_callchain_kernel(regs, entry);
-}
-
-/*
- * No need for separate IRQ and NMI entries.
- */
-static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-       struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
-
-       entry->nr = 0;
-
-       perf_do_callchain(regs, entry);
-
-       return entry;
-}
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c

index 7a3dc356725839f2cf8579491efd8d02ba11b483..036f7a9296fabe96b8c33156791ea3a051da683e 100644 (file)
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -206,50 +206,80 @@ again:
         local64_add(delta, &event->count);
  }
  
-static void sh_pmu_disable(struct perf_event *event)
+static void sh_pmu_stop(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
         int idx = hwc->idx;
  
-       clear_bit(idx, cpuc->active_mask);
-       sh_pmu->disable(hwc, idx);
+       if (!(event->hw.state & PERF_HES_STOPPED)) {
+               sh_pmu->disable(hwc, idx);
+               cpuc->events[idx] = NULL;
+               event->hw.state |= PERF_HES_STOPPED;
+       }
+
+       if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) {
+               sh_perf_event_update(event, &event->hw, idx);
+               event->hw.state |= PERF_HES_UPTODATE;
+       }
+}
  
-       barrier();
+static void sh_pmu_start(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       int idx = hwc->idx;
  
-       sh_perf_event_update(event, &event->hw, idx);
+       if (WARN_ON_ONCE(idx == -1))
+               return;
+
+       if (flags & PERF_EF_RELOAD)
+               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
+       cpuc->events[idx] = event;
+       event->hw.state = 0;
+       sh_pmu->enable(hwc, idx);
+}
+
+static void sh_pmu_del(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
-       cpuc->events[idx] = NULL;
-       clear_bit(idx, cpuc->used_mask);
+       sh_pmu_stop(event, PERF_EF_UPDATE);
+       __clear_bit(event->hw.idx, cpuc->used_mask);
  
         perf_event_update_userpage(event);
  }
  
-static int sh_pmu_enable(struct perf_event *event)
+static int sh_pmu_add(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
         int idx = hwc->idx;
+       int ret = -EAGAIN;
+
+       perf_pmu_disable(event->pmu);
  
-       if (test_and_set_bit(idx, cpuc->used_mask)) {
+       if (__test_and_set_bit(idx, cpuc->used_mask)) {
                 idx = find_first_zero_bit(cpuc->used_mask, sh_pmu->num_events);
                 if (idx == sh_pmu->num_events)
-                       return -EAGAIN;
+                       goto out;
  
-               set_bit(idx, cpuc->used_mask);
+               __set_bit(idx, cpuc->used_mask);
                 hwc->idx = idx;
         }
  
         sh_pmu->disable(hwc, idx);
  
-       cpuc->events[idx] = event;
-       set_bit(idx, cpuc->active_mask);
-
-       sh_pmu->enable(hwc, idx);
+       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (flags & PERF_EF_START)
+               sh_pmu_start(event, PERF_EF_RELOAD);
  
         perf_event_update_userpage(event);
-
-       return 0;
+       ret = 0;
+out:
+       perf_pmu_enable(event->pmu);
+       return ret;
  }
  
  static void sh_pmu_read(struct perf_event *event)
@@ -257,24 +287,56 @@ static void sh_pmu_read(struct perf_event *event)
         sh_perf_event_update(event, &event->hw, event->hw.idx);
  }
  
-static const struct pmu pmu = {
-       .enable         = sh_pmu_enable,
-       .disable        = sh_pmu_disable,
-       .read           = sh_pmu_read,
-};
-
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int sh_pmu_event_init(struct perf_event *event)
  {
-       int err = __hw_perf_event_init(event);
+       int err;
+
+       switch (event->attr.type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HW_CACHE:
+       case PERF_TYPE_HARDWARE:
+               err = __hw_perf_event_init(event);
+               break;
+
+       default:
+               return -ENOENT;
+       }
+
         if (unlikely(err)) {
                 if (event->destroy)
                         event->destroy(event);
-               return ERR_PTR(err);
         }
  
-       return &pmu;
+       return err;
+}
+
+static void sh_pmu_enable(struct pmu *pmu)
+{
+       if (!sh_pmu_initialized())
+               return;
+
+       sh_pmu->enable_all();
+}
+
+static void sh_pmu_disable(struct pmu *pmu)
+{
+       if (!sh_pmu_initialized())
+               return;
+
+       sh_pmu->disable_all();
  }
  
+static struct pmu pmu = {
+       .pmu_enable     = sh_pmu_enable,
+       .pmu_disable    = sh_pmu_disable,
+       .event_init     = sh_pmu_event_init,
+       .add            = sh_pmu_add,
+       .del            = sh_pmu_del,
+       .start          = sh_pmu_start,
+       .stop           = sh_pmu_stop,
+       .read           = sh_pmu_read,
+};
+
  static void sh_pmu_setup(int cpu)
  {
         struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
@@ -299,32 +361,17 @@ sh_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
         return NOTIFY_OK;
  }
  
-void hw_perf_enable(void)
-{
-       if (!sh_pmu_initialized())
-               return;
-
-       sh_pmu->enable_all();
-}
-
-void hw_perf_disable(void)
-{
-       if (!sh_pmu_initialized())
-               return;
-
-       sh_pmu->disable_all();
-}
-
-int __cpuinit register_sh_pmu(struct sh_pmu *pmu)
+int __cpuinit register_sh_pmu(struct sh_pmu *_pmu)
  {
         if (sh_pmu)
                 return -EBUSY;
-       sh_pmu = pmu;
+       sh_pmu = _pmu;
  
-       pr_info("Performance Events: %s support registered\n", pmu->name);
+       pr_info("Performance Events: %s support registered\n", _pmu->name);
  
-       WARN_ON(pmu->num_events > MAX_HWEVENTS);
+       WARN_ON(_pmu->num_events > MAX_HWEVENTS);
  
+       perf_pmu_register(&pmu);
         perf_cpu_notifier(sh_pmu_notifier);
         return 0;
  }
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c

index 6318e622cfb065d9da81e40b766c12dc0a6e2e7b..0d6deb55a2ae7e4189b5ab60aec81cd8df28adb6 100644 (file)
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -658,13 +658,16 @@ static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr)
  
                 enc = perf_event_get_enc(cpuc->events[i]);
                 pcr &= ~mask_for_index(idx);
-               pcr |= event_encoding(enc, idx);
+               if (hwc->state & PERF_HES_STOPPED)
+                       pcr |= nop_for_index(idx);
+               else
+                       pcr |= event_encoding(enc, idx);
         }
  out:
         return pcr;
  }
  
-void hw_perf_enable(void)
+static void sparc_pmu_enable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         u64 pcr;
@@ -691,7 +694,7 @@ void hw_perf_enable(void)
         pcr_ops->write(cpuc->pcr);
  }
  
-void hw_perf_disable(void)
+static void sparc_pmu_disable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         u64 val;
@@ -710,19 +713,65 @@ void hw_perf_disable(void)
         pcr_ops->write(cpuc->pcr);
  }
  
-static void sparc_pmu_disable(struct perf_event *event)
+static int active_event_index(struct cpu_hw_events *cpuc,
+                             struct perf_event *event)
+{
+       int i;
+
+       for (i = 0; i < cpuc->n_events; i++) {
+               if (cpuc->event[i] == event)
+                       break;
+       }
+       BUG_ON(i == cpuc->n_events);
+       return cpuc->current_idx[i];
+}
+
+static void sparc_pmu_start(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       int idx = active_event_index(cpuc, event);
+
+       if (flags & PERF_EF_RELOAD) {
+               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+               sparc_perf_event_set_period(event, &event->hw, idx);
+       }
+
+       event->hw.state = 0;
+
+       sparc_pmu_enable_event(cpuc, &event->hw, idx);
+}
+
+static void sparc_pmu_stop(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       int idx = active_event_index(cpuc, event);
+
+       if (!(event->hw.state & PERF_HES_STOPPED)) {
+               sparc_pmu_disable_event(cpuc, &event->hw, idx);
+               event->hw.state |= PERF_HES_STOPPED;
+       }
+
+       if (!(event->hw.state & PERF_HES_UPTODATE) && (flags & PERF_EF_UPDATE)) {
+               sparc_perf_event_update(event, &event->hw, idx);
+               event->hw.state |= PERF_HES_UPTODATE;
+       }
+}
+
+static void sparc_pmu_del(struct perf_event *event, int _flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
         unsigned long flags;
         int i;
  
         local_irq_save(flags);
-       perf_disable();
+       perf_pmu_disable(event->pmu);
  
         for (i = 0; i < cpuc->n_events; i++) {
                 if (event == cpuc->event[i]) {
-                       int idx = cpuc->current_idx[i];
+                       /* Absorb the final count and turn off the
+                        * event.
+                        */
+                       sparc_pmu_stop(event, PERF_EF_UPDATE);
  
                         /* Shift remaining entries down into
                          * the existing slot.
@@ -734,13 +783,6 @@ static void sparc_pmu_disable(struct perf_event *event)
                                         cpuc->current_idx[i];
                         }
  
-                       /* Absorb the final count and turn off the
-                        * event.
-                        */
-                       sparc_pmu_disable_event(cpuc, hwc, idx);
-                       barrier();
-                       sparc_perf_event_update(event, hwc, idx);
-
                         perf_event_update_userpage(event);
  
                         cpuc->n_events--;
@@ -748,23 +790,10 @@ static void sparc_pmu_disable(struct perf_event *event)
                 }
         }
  
-       perf_enable();
+       perf_pmu_enable(event->pmu);
         local_irq_restore(flags);
  }
  
-static int active_event_index(struct cpu_hw_events *cpuc,
-                             struct perf_event *event)
-{
-       int i;
-
-       for (i = 0; i < cpuc->n_events; i++) {
-               if (cpuc->event[i] == event)
-                       break;
-       }
-       BUG_ON(i == cpuc->n_events);
-       return cpuc->current_idx[i];
-}
-
  static void sparc_pmu_read(struct perf_event *event)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -774,15 +803,6 @@ static void sparc_pmu_read(struct perf_event *event)
         sparc_perf_event_update(event, hwc, idx);
  }
  
-static void sparc_pmu_unthrottle(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-       int idx = active_event_index(cpuc, event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       sparc_pmu_enable_event(cpuc, hwc, idx);
-}
-
  static atomic_t active_events = ATOMIC_INIT(0);
  static DEFINE_MUTEX(pmc_grab_mutex);
  
@@ -877,7 +897,7 @@ static int sparc_check_constraints(struct perf_event **evts,
         if (!n_ev)
                 return 0;
  
-       if (n_ev > perf_max_events)
+       if (n_ev > MAX_HWEVENTS)
                 return -1;
  
         msk0 = perf_event_get_msk(events[0]);
@@ -984,23 +1004,27 @@ static int collect_events(struct perf_event *group, int max_count,
         return n;
  }
  
-static int sparc_pmu_enable(struct perf_event *event)
+static int sparc_pmu_add(struct perf_event *event, int ef_flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         int n0, ret = -EAGAIN;
         unsigned long flags;
  
         local_irq_save(flags);
-       perf_disable();
+       perf_pmu_disable(event->pmu);
  
         n0 = cpuc->n_events;
-       if (n0 >= perf_max_events)
+       if (n0 >= MAX_HWEVENTS)
                 goto out;
  
         cpuc->event[n0] = event;
         cpuc->events[n0] = event->hw.event_base;
         cpuc->current_idx[n0] = PIC_NO_INDEX;
  
+       event->hw.state = PERF_HES_UPTODATE;
+       if (!(ef_flags & PERF_EF_START))
+               event->hw.state |= PERF_HES_STOPPED;
+
         /*
          * If group events scheduling transaction was started,
          * skip the schedulability test here, it will be peformed
@@ -1020,12 +1044,12 @@ nocheck:
  
         ret = 0;
  out:
-       perf_enable();
+       perf_pmu_enable(event->pmu);
         local_irq_restore(flags);
         return ret;
  }
  
-static int __hw_perf_event_init(struct perf_event *event)
+static int sparc_pmu_event_init(struct perf_event *event)
  {
         struct perf_event_attr *attr = &event->attr;
         struct perf_event *evts[MAX_HWEVENTS];
@@ -1038,22 +1062,33 @@ static int __hw_perf_event_init(struct perf_event *event)
         if (atomic_read(&nmi_active) < 0)
                 return -ENODEV;
  
-       pmap = NULL;
-       if (attr->type == PERF_TYPE_HARDWARE) {
+       switch (attr->type) {
+       case PERF_TYPE_HARDWARE:
                 if (attr->config >= sparc_pmu->max_events)
                         return -EINVAL;
                 pmap = sparc_pmu->event_map(attr->config);
-       } else if (attr->type == PERF_TYPE_HW_CACHE) {
+               break;
+
+       case PERF_TYPE_HW_CACHE:
                 pmap = sparc_map_cache_event(attr->config);
                 if (IS_ERR(pmap))
                         return PTR_ERR(pmap);
-       } else if (attr->type != PERF_TYPE_RAW)
-               return -EOPNOTSUPP;
+               break;
+
+       case PERF_TYPE_RAW:
+               pmap = NULL;
+               break;
+
+       default:
+               return -ENOENT;
+
+       }
  
         if (pmap) {
                 hwc->event_base = perf_event_encode(pmap);
         } else {
-               /* User gives us "(encoding << 16) | pic_mask" for
+               /*
+                * User gives us "(encoding << 16) | pic_mask" for
                  * PERF_TYPE_RAW events.
                  */
                 hwc->event_base = attr->config;
@@ -1071,7 +1106,7 @@ static int __hw_perf_event_init(struct perf_event *event)
         n = 0;
         if (event->group_leader != event) {
                 n = collect_events(event->group_leader,
-                                  perf_max_events - 1,
+                                  MAX_HWEVENTS - 1,
                                    evts, events, current_idx_dmy);
                 if (n < 0)
                         return -EINVAL;
@@ -1107,10 +1142,11 @@ static int __hw_perf_event_init(struct perf_event *event)
   * Set the flag to make pmu::enable() not perform the
   * schedulability test, it will be performed at commit time
   */
-static void sparc_pmu_start_txn(const struct pmu *pmu)
+static void sparc_pmu_start_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
  
+       perf_pmu_disable(pmu);
         cpuhw->group_flag |= PERF_EVENT_TXN;
  }
  
@@ -1119,11 +1155,12 @@ static void sparc_pmu_start_txn(const struct pmu *pmu)
   * Clear the flag and pmu::enable() will perform the
   * schedulability test.
   */
-static void sparc_pmu_cancel_txn(const struct pmu *pmu)
+static void sparc_pmu_cancel_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
  
         cpuhw->group_flag &= ~PERF_EVENT_TXN;
+       perf_pmu_enable(pmu);
  }
  
  /*
@@ -1131,7 +1168,7 @@ static void sparc_pmu_cancel_txn(const struct pmu *pmu)
   * Perform the group schedulability test as a whole
   * Return 0 if success
   */
-static int sparc_pmu_commit_txn(const struct pmu *pmu)
+static int sparc_pmu_commit_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         int n;
@@ -1147,28 +1184,24 @@ static int sparc_pmu_commit_txn(const struct pmu *pmu)
                 return -EAGAIN;
  
         cpuc->group_flag &= ~PERF_EVENT_TXN;
+       perf_pmu_enable(pmu);
         return 0;
  }
  
-static const struct pmu pmu = {
-       .enable         = sparc_pmu_enable,
-       .disable        = sparc_pmu_disable,
+static struct pmu pmu = {
+       .pmu_enable     = sparc_pmu_enable,
+       .pmu_disable    = sparc_pmu_disable,
+       .event_init     = sparc_pmu_event_init,
+       .add            = sparc_pmu_add,
+       .del            = sparc_pmu_del,
+       .start          = sparc_pmu_start,
+       .stop           = sparc_pmu_stop,
         .read           = sparc_pmu_read,
-       .unthrottle     = sparc_pmu_unthrottle,
         .start_txn      = sparc_pmu_start_txn,
         .cancel_txn     = sparc_pmu_cancel_txn,
         .commit_txn     = sparc_pmu_commit_txn,
  };
  
-const struct pmu *hw_perf_event_init(struct perf_event *event)
-{
-       int err = __hw_perf_event_init(event);
-
-       if (err)
-               return ERR_PTR(err);
-       return &pmu;
-}
-
  void perf_event_print_debug(void)
  {
         unsigned long flags;
@@ -1244,7 +1277,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
                         continue;
  
                 if (perf_event_overflow(event, 1, &data, regs))
-                       sparc_pmu_disable_event(cpuc, hwc, idx);
+                       sparc_pmu_stop(event, 0);
         }
  
         return NOTIFY_STOP;
@@ -1285,28 +1318,21 @@ void __init init_hw_perf_events(void)
  
         pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
  
-       /* All sparc64 PMUs currently have 2 events.  */
-       perf_max_events = 2;
-
+       perf_pmu_register(&pmu);
         register_die_notifier(&perf_event_nmi_notifier);
  }
  
-static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-       if (entry->nr < PERF_MAX_STACK_DEPTH)
-               entry->ip[entry->nr++] = ip;
-}
-
-static void perf_callchain_kernel(struct pt_regs *regs,
-                                 struct perf_callchain_entry *entry)
+void perf_callchain_kernel(struct perf_callchain_entry *entry,
+                          struct pt_regs *regs)
  {
         unsigned long ksp, fp;
  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
         int graph = 0;
  #endif
  
-       callchain_store(entry, PERF_CONTEXT_KERNEL);
-       callchain_store(entry, regs->tpc);
+       stack_trace_flush();
+
+       perf_callchain_store(entry, regs->tpc);
  
         ksp = regs->u_regs[UREG_I6];
         fp = ksp + STACK_BIAS;
@@ -1330,13 +1356,13 @@ static void perf_callchain_kernel(struct pt_regs *regs,
                         pc = sf->callers_pc;
                         fp = (unsigned long)sf->fp + STACK_BIAS;
                 }
-               callchain_store(entry, pc);
+               perf_callchain_store(entry, pc);
  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
                 if ((pc + 8UL) == (unsigned long) &return_to_handler) {
                         int index = current->curr_ret_stack;
                         if (current->ret_stack && index >= graph) {
                                 pc = current->ret_stack[index - graph].ret;
-                               callchain_store(entry, pc);
+                               perf_callchain_store(entry, pc);
                                 graph++;
                         }
                 }
@@ -1344,13 +1370,12 @@ static void perf_callchain_kernel(struct pt_regs *regs,
         } while (entry->nr < PERF_MAX_STACK_DEPTH);
  }
  
-static void perf_callchain_user_64(struct pt_regs *regs,
-                                  struct perf_callchain_entry *entry)
+static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+                                  struct pt_regs *regs)
  {
         unsigned long ufp;
  
-       callchain_store(entry, PERF_CONTEXT_USER);
-       callchain_store(entry, regs->tpc);
+       perf_callchain_store(entry, regs->tpc);
  
         ufp = regs->u_regs[UREG_I6] + STACK_BIAS;
         do {
@@ -1363,17 +1388,16 @@ static void perf_callchain_user_64(struct pt_regs *regs,
  
                 pc = sf.callers_pc;
                 ufp = (unsigned long)sf.fp + STACK_BIAS;
-               callchain_store(entry, pc);
+               perf_callchain_store(entry, pc);
         } while (entry->nr < PERF_MAX_STACK_DEPTH);
  }
  
-static void perf_callchain_user_32(struct pt_regs *regs,
-                                  struct perf_callchain_entry *entry)
+static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+                                  struct pt_regs *regs)
  {
         unsigned long ufp;
  
-       callchain_store(entry, PERF_CONTEXT_USER);
-       callchain_store(entry, regs->tpc);
+       perf_callchain_store(entry, regs->tpc);
  
         ufp = regs->u_regs[UREG_I6] & 0xffffffffUL;
         do {
@@ -1386,34 +1410,16 @@ static void perf_callchain_user_32(struct pt_regs *regs,
  
                 pc = sf.callers_pc;
                 ufp = (unsigned long)sf.fp;
-               callchain_store(entry, pc);
+               perf_callchain_store(entry, pc);
         } while (entry->nr < PERF_MAX_STACK_DEPTH);
  }
  
-/* Like powerpc we can't get PMU interrupts within the PMU handler,
- * so no need for separate NMI and IRQ chains as on x86.
- */
-static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
-       struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
-
-       entry->nr = 0;
-       if (!user_mode(regs)) {
-               stack_trace_flush();
-               perf_callchain_kernel(regs, entry);
-               if (current->mm)
-                       regs = task_pt_regs(current);
-               else
-                       regs = NULL;
-       }
-       if (regs) {
-               flushw_user();
-               if (test_thread_flag(TIF_32BIT))
-                       perf_callchain_user_32(regs, entry);
-               else
-                       perf_callchain_user_64(regs, entry);
-       }
-       return entry;
+       flushw_user();
+       if (test_thread_flag(TIF_32BIT))
+               perf_callchain_user_32(entry, regs);
+       else
+               perf_callchain_user_64(entry, regs);
  }
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h

index def500776b16a3b63d34da569021722e4d82f18a..a70cd216be5d729db1f364340f911d632819f18d 100644 (file)
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -36,19 +36,6 @@
  #define P4_ESCR_EMASK(v)       ((v) << P4_ESCR_EVENTMASK_SHIFT)
  #define P4_ESCR_TAG(v)         ((v) << P4_ESCR_TAG_SHIFT)
  
-/* Non HT mask */
-#define P4_ESCR_MASK                   \
-       (P4_ESCR_EVENT_MASK     |       \
-       P4_ESCR_EVENTMASK_MASK  |       \
-       P4_ESCR_TAG_MASK        |       \
-       P4_ESCR_TAG_ENABLE      |       \
-       P4_ESCR_T0_OS           |       \
-       P4_ESCR_T0_USR)
-
-/* HT mask */
-#define P4_ESCR_MASK_HT                        \
-       (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR)
-
  #define P4_CCCR_OVF                    0x80000000U
  #define P4_CCCR_CASCADE                        0x40000000U
  #define P4_CCCR_OVF_PMI_T0             0x04000000U
@@ -70,23 +57,6 @@
  #define P4_CCCR_THRESHOLD(v)           ((v) << P4_CCCR_THRESHOLD_SHIFT)
  #define P4_CCCR_ESEL(v)                        ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
  
-/* Non HT mask */
-#define P4_CCCR_MASK                           \
-       (P4_CCCR_OVF                    |       \
-       P4_CCCR_CASCADE                 |       \
-       P4_CCCR_OVF_PMI_T0              |       \
-       P4_CCCR_FORCE_OVF               |       \
-       P4_CCCR_EDGE                    |       \
-       P4_CCCR_THRESHOLD_MASK          |       \
-       P4_CCCR_COMPLEMENT              |       \
-       P4_CCCR_COMPARE                 |       \
-       P4_CCCR_ESCR_SELECT_MASK        |       \
-       P4_CCCR_ENABLE)
-
-/* HT mask */
-#define P4_CCCR_MASK_HT                                \
-       (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY)
-
  #define P4_GEN_ESCR_EMASK(class, name, bit)    \
         class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
  #define P4_ESCR_EMASK_BIT(class, name)         class##__##name
@@ -127,6 +97,28 @@
  #define P4_CONFIG_HT_SHIFT             63
  #define P4_CONFIG_HT                   (1ULL << P4_CONFIG_HT_SHIFT)
  
+/* The bits we allow to pass for RAW events.  Every mask is wrapped in
+ * parentheses so that it expands as a single operand wherever it is used,
+ * e.g. in "config & P4_CONFIG_MASK". */
+#define P4_CONFIG_MASK_ESCR            \
+       (P4_ESCR_EVENT_MASK     |       \
+       P4_ESCR_EVENTMASK_MASK  |       \
+       P4_ESCR_TAG_MASK        |       \
+       P4_ESCR_TAG_ENABLE)
+
+#define P4_CONFIG_MASK_CCCR            \
+       (P4_CCCR_EDGE           |       \
+       P4_CCCR_THRESHOLD_MASK  |       \
+       P4_CCCR_COMPLEMENT      |       \
+       P4_CCCR_COMPARE         |       \
+       P4_CCCR_THREAD_ANY      |       \
+       P4_CCCR_RESERVED)
+
+/* some dangerous bits are reserved for kernel internals */
+#define P4_CONFIG_MASK                                   \
+       ((p4_config_pack_escr(P4_CONFIG_MASK_ESCR))     | \
+       (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR)))
+
  static inline bool p4_is_event_cascaded(u64 config)
  {
         u32 cccr = p4_config_unpack_cccr(config);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c

index 3efdf2870a3572263add749326aa5925df7c461f..0fb17050360fb846806aa7f1bc1c9b564219246e 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -530,7 +530,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
  /*
   * Setup the hardware configuration for a given attr_type
   */
-static int __hw_perf_event_init(struct perf_event *event)
+static int __x86_pmu_event_init(struct perf_event *event)
  {
         int err;
  
@@ -583,7 +583,7 @@ static void x86_pmu_disable_all(void)
         }
  }
  
-void hw_perf_disable(void)
+static void x86_pmu_disable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
@@ -618,7 +618,7 @@ static void x86_pmu_enable_all(int added)
         }
  }
  
-static const struct pmu pmu;
+static struct pmu pmu;
  
  static inline int is_x86_event(struct perf_event *event)
  {
@@ -800,10 +800,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
                 hwc->last_tag == cpuc->tags[i];
  }
  
-static int x86_pmu_start(struct perf_event *event);
-static void x86_pmu_stop(struct perf_event *event);
+static void x86_pmu_start(struct perf_event *event, int flags);
+static void x86_pmu_stop(struct perf_event *event, int flags);
  
-void hw_perf_enable(void)
+static void x86_pmu_enable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct perf_event *event;
@@ -839,7 +839,14 @@ void hw_perf_enable(void)
                             match_prev_assignment(hwc, cpuc, i))
                                 continue;
  
-                       x86_pmu_stop(event);
+                       /*
+                        * Ensure we don't accidentally enable a stopped
+                        * counter simply because we rescheduled.
+                        */
+                       if (hwc->state & PERF_HES_STOPPED)
+                               hwc->state |= PERF_HES_ARCH;
+
+                       x86_pmu_stop(event, PERF_EF_UPDATE);
                 }
  
                 for (i = 0; i < cpuc->n_events; i++) {
@@ -851,7 +858,10 @@ void hw_perf_enable(void)
                         else if (i < n_running)
                                 continue;
  
-                       x86_pmu_start(event);
+                       if (hwc->state & PERF_HES_ARCH)
+                               continue;
+
+                       x86_pmu_start(event, PERF_EF_RELOAD);
                 }
                 cpuc->n_added = 0;
                 perf_events_lapic_init();
@@ -952,15 +962,12 @@ static void x86_pmu_enable_event(struct perf_event *event)
  }
  
  /*
- * activate a single event
+ * Add a single event to the PMU.
   *
   * The event is added to the group of enabled events
   * but only if it can be scehduled with existing events.
- *
- * Called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
   */
-static int x86_pmu_enable(struct perf_event *event)
+static int x86_pmu_add(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc;
@@ -969,57 +976,66 @@ static int x86_pmu_enable(struct perf_event *event)
  
         hwc = &event->hw;
  
+       perf_pmu_disable(event->pmu);
         n0 = cpuc->n_events;
-       n = collect_events(cpuc, event, false);
-       if (n < 0)
-               return n;
+       ret = n = collect_events(cpuc, event, false);
+       if (ret < 0)
+               goto out;
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (!(flags & PERF_EF_START))
+               hwc->state |= PERF_HES_ARCH;
  
         /*
          * If group events scheduling transaction was started,
          * skip the schedulability test here, it will be peformed
-        * at commit time(->commit_txn) as a whole
+        * at commit time (->commit_txn) as a whole
          */
         if (cpuc->group_flag & PERF_EVENT_TXN)
-               goto out;
+               goto done_collect;
  
         ret = x86_pmu.schedule_events(cpuc, n, assign);
         if (ret)
-               return ret;
+               goto out;
         /*
          * copy new assignment, now we know it is possible
          * will be used by hw_perf_enable()
          */
         memcpy(cpuc->assign, assign, n*sizeof(int));
  
-out:
+done_collect:
         cpuc->n_events = n;
         cpuc->n_added += n - n0;
         cpuc->n_txn += n - n0;
  
-       return 0;
+       ret = 0;
+out:
+       perf_pmu_enable(event->pmu);
+       return ret;
  }
  
-static int x86_pmu_start(struct perf_event *event)
+static void x86_pmu_start(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         int idx = event->hw.idx;
  
-       if (idx == -1)
-               return -EAGAIN;
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       if (WARN_ON_ONCE(idx == -1))
+               return;
+
+       if (flags & PERF_EF_RELOAD) {
+               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+               x86_perf_event_set_period(event);
+       }
+
+       event->hw.state = 0;
  
-       x86_perf_event_set_period(event);
         cpuc->events[idx] = event;
         __set_bit(idx, cpuc->active_mask);
         x86_pmu.enable(event);
         perf_event_update_userpage(event);
-
-       return 0;
-}
-
-static void x86_pmu_unthrottle(struct perf_event *event)
-{
-       int ret = x86_pmu_start(event);
-       WARN_ON_ONCE(ret);
  }
  
  void perf_event_print_debug(void)
@@ -1076,27 +1092,29 @@ void perf_event_print_debug(void)
         local_irq_restore(flags);
  }
  
-static void x86_pmu_stop(struct perf_event *event)
+static void x86_pmu_stop(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
-       int idx = hwc->idx;
  
-       if (!__test_and_clear_bit(idx, cpuc->active_mask))
-               return;
-
-       x86_pmu.disable(event);
-
-       /*
-        * Drain the remaining delta count out of a event
-        * that we are disabling:
-        */
-       x86_perf_event_update(event);
+       if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
+               x86_pmu.disable(event);
+               cpuc->events[hwc->idx] = NULL;
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+       }
  
-       cpuc->events[idx] = NULL;
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of a event
+                * that we are disabling:
+                */
+               x86_perf_event_update(event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
  }
  
-static void x86_pmu_disable(struct perf_event *event)
+static void x86_pmu_del(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         int i;
@@ -1109,7 +1127,7 @@ static void x86_pmu_disable(struct perf_event *event)
         if (cpuc->group_flag & PERF_EVENT_TXN)
                 return;
  
-       x86_pmu_stop(event);
+       x86_pmu_stop(event, PERF_EF_UPDATE);
  
         for (i = 0; i < cpuc->n_events; i++) {
                 if (event == cpuc->event_list[i]) {
@@ -1161,7 +1179,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
                         continue;
  
                 if (perf_event_overflow(event, 1, &data, regs))
-                       x86_pmu_stop(event);
+                       x86_pmu_stop(event, 0);
         }
  
         if (handled)
@@ -1378,7 +1396,6 @@ void __init init_hw_perf_events(void)
                 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
         }
         x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-       perf_max_events = x86_pmu.num_counters;
  
         if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
                 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
@@ -1414,6 +1431,7 @@ void __init init_hw_perf_events(void)
         pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
         pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
  
+       perf_pmu_register(&pmu);
         perf_cpu_notifier(x86_pmu_notifier);
  }
  
@@ -1427,10 +1445,11 @@ static inline void x86_pmu_read(struct perf_event *event)
   * Set the flag to make pmu::enable() not perform the
   * schedulability test, it will be performed at commit time
   */
-static void x86_pmu_start_txn(const struct pmu *pmu)
+static void x86_pmu_start_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
+       perf_pmu_disable(pmu);
         cpuc->group_flag |= PERF_EVENT_TXN;
         cpuc->n_txn = 0;
  }
@@ -1440,7 +1459,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
   * Clear the flag and pmu::enable() will perform the
   * schedulability test.
   */
-static void x86_pmu_cancel_txn(const struct pmu *pmu)
+static void x86_pmu_cancel_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
@@ -1450,6 +1469,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
          */
         cpuc->n_added -= cpuc->n_txn;
         cpuc->n_events -= cpuc->n_txn;
+       perf_pmu_enable(pmu);
  }
  
  /*
@@ -1457,7 +1477,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
   * Perform the group schedulability test as a whole
   * Return 0 if success
   */
-static int x86_pmu_commit_txn(const struct pmu *pmu)
+static int x86_pmu_commit_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         int assign[X86_PMC_IDX_MAX];
@@ -1479,22 +1499,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
         memcpy(cpuc->assign, assign, n*sizeof(int));
  
         cpuc->group_flag &= ~PERF_EVENT_TXN;
-
+       perf_pmu_enable(pmu);
         return 0;
  }
  
-static const struct pmu pmu = {
-       .enable         = x86_pmu_enable,
-       .disable        = x86_pmu_disable,
-       .start          = x86_pmu_start,
-       .stop           = x86_pmu_stop,
-       .read           = x86_pmu_read,
-       .unthrottle     = x86_pmu_unthrottle,
-       .start_txn      = x86_pmu_start_txn,
-       .cancel_txn     = x86_pmu_cancel_txn,
-       .commit_txn     = x86_pmu_commit_txn,
-};
-
  /*
   * validate that we can schedule this event
   */
@@ -1569,12 +1577,22 @@ out:
         return ret;
  }
  
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+int x86_pmu_event_init(struct perf_event *event)
  {
-       const struct pmu *tmp;
+       struct pmu *tmp;
         int err;
  
-       err = __hw_perf_event_init(event);
+       switch (event->attr.type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HARDWARE:
+       case PERF_TYPE_HW_CACHE:
+               break;
+
+       default:
+               return -ENOENT;
+       }
+
+       err = __x86_pmu_event_init(event);
         if (!err) {
                 /*
                  * we temporarily connect event to its pmu
@@ -1594,26 +1612,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
         if (err) {
                 if (event->destroy)
                         event->destroy(event);
-               return ERR_PTR(err);
         }
  
-       return &pmu;
+       return err;
  }
  
-/*
- * callchain support
- */
+static struct pmu pmu = {
+       .pmu_enable     = x86_pmu_enable,
+       .pmu_disable    = x86_pmu_disable,
  
-static inline
-void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-       if (entry->nr < PERF_MAX_STACK_DEPTH)
-               entry->ip[entry->nr++] = ip;
-}
+       .event_init     = x86_pmu_event_init,
  
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
+       .add            = x86_pmu_add,
+       .del            = x86_pmu_del,
+       .start          = x86_pmu_start,
+       .stop           = x86_pmu_stop,
+       .read           = x86_pmu_read,
+
+       .start_txn      = x86_pmu_start_txn,
+       .cancel_txn     = x86_pmu_cancel_txn,
+       .commit_txn     = x86_pmu_commit_txn,
+};
  
+/*
+ * callchain support
+ */
  
  static void
  backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
@@ -1635,7 +1658,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
  {
         struct perf_callchain_entry *entry = data;
  
-       callchain_store(entry, addr);
+       perf_callchain_store(entry, addr);
  }
  
  static const struct stacktrace_ops backtrace_ops = {
@@ -1646,11 +1669,15 @@ static const struct stacktrace_ops backtrace_ops = {
         .walk_stack             = print_context_stack_bp,
  };
  
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
-       callchain_store(entry, PERF_CONTEXT_KERNEL);
-       callchain_store(entry, regs->ip);
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
+
+       perf_callchain_store(entry, regs->ip);
  
         dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
  }
@@ -1679,7 +1706,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
                 if (fp < compat_ptr(regs->sp))
                         break;
  
-               callchain_store(entry, frame.return_address);
+               perf_callchain_store(entry, frame.return_address);
                 fp = compat_ptr(frame.next_frame);
         }
         return 1;
@@ -1692,19 +1719,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
  }
  #endif
  
-static void
-perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
         struct stack_frame frame;
         const void __user *fp;
  
-       if (!user_mode(regs))
-               regs = task_pt_regs(current);
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
  
         fp = (void __user *)regs->bp;
  
-       callchain_store(entry, PERF_CONTEXT_USER);
-       callchain_store(entry, regs->ip);
+       perf_callchain_store(entry, regs->ip);
  
         if (perf_callchain_user32(regs, entry))
                 return;
@@ -1721,52 +1749,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
                 if ((unsigned long)fp < regs->sp)
                         break;
  
-               callchain_store(entry, frame.return_address);
+               perf_callchain_store(entry, frame.return_address);
                 fp = frame.next_frame;
         }
  }
  
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-       int is_user;
-
-       if (!regs)
-               return;
-
-       is_user = user_mode(regs);
-
-       if (is_user && current->state != TASK_RUNNING)
-               return;
-
-       if (!is_user)
-               perf_callchain_kernel(regs, entry);
-
-       if (current->mm)
-               perf_callchain_user(regs, entry);
-}
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-       struct perf_callchain_entry *entry;
-
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               /* TODO: We don't support guest os callchain now */
-               return NULL;
-       }
-
-       if (in_nmi())
-               entry = &__get_cpu_var(pmc_nmi_entry);
-       else
-               entry = &__get_cpu_var(pmc_irq_entry);
-
-       entry->nr = 0;
-
-       perf_do_callchain(regs, entry);
-
-       return entry;
-}
-
  unsigned long perf_instruction_pointer(struct pt_regs *regs)
  {
         unsigned long ip;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c

index ee05c90012d269e66de9ffb6a5952d1434317c1e..c8f5c088cad11ae3f245e1e7374bb43c915170d6 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -713,18 +713,18 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
         struct cpu_hw_events *cpuc;
         int bit, loops;
         u64 status;
-       int handled = 0;
+       int handled;
  
         perf_sample_data_init(&data, 0);
  
         cpuc = &__get_cpu_var(cpu_hw_events);
  
         intel_pmu_disable_all();
-       intel_pmu_drain_bts_buffer();
+       handled = intel_pmu_drain_bts_buffer();
         status = intel_pmu_get_status();
         if (!status) {
                 intel_pmu_enable_all(0);
-               return 0;
+               return handled;
         }
  
         loops = 0;
@@ -763,7 +763,7 @@ again:
                 data.period = event->hw.last_period;
  
                 if (perf_event_overflow(event, 1, &data, regs))
-                       x86_pmu_stop(event);
+                       x86_pmu_stop(event, 0);
         }
  
         /*
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c

index 18018d1311cdf3fa7f550b2fd7894c278505afe6..4977f9c400e5738cb668efc937c69cde22a2772d 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -214,7 +214,7 @@ static void intel_pmu_disable_bts(void)
         update_debugctlmsr(debugctlmsr);
  }
  
-static void intel_pmu_drain_bts_buffer(void)
+static int intel_pmu_drain_bts_buffer(void)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct debug_store *ds = cpuc->ds;
@@ -231,16 +231,16 @@ static void intel_pmu_drain_bts_buffer(void)
         struct pt_regs regs;
  
         if (!event)
-               return;
+               return 0;
  
         if (!ds)
-               return;
+               return 0;
  
         at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
         top = (struct bts_record *)(unsigned long)ds->bts_index;
  
         if (top <= at)
-               return;
+               return 0;
  
         ds->bts_index = ds->bts_buffer_base;
  
@@ -256,7 +256,7 @@ static void intel_pmu_drain_bts_buffer(void)
         perf_prepare_sample(&header, &data, event, &regs);
  
         if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
-               return;
+               return 1;
  
         for (; at < top; at++) {
                 data.ip         = at->from;
@@ -270,6 +270,7 @@ static void intel_pmu_drain_bts_buffer(void)
         /* There's new data available. */
         event->hw.interrupts++;
         event->pending_kill = POLL_IN;
+       return 1;
  }
  
  /*
@@ -491,7 +492,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
                 regs.flags &= ~PERF_EFLAGS_EXACT;
  
         if (perf_event_overflow(event, 1, &data, &regs))
-               x86_pmu_stop(event);
+               x86_pmu_stop(event, 0);
  }
  
  static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c

index b560db3305be16ff954fc416137d17b17189b5e7..c70c878ee02a022b0a92a070139b57e78d3ef943 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -18,6 +18,8 @@
  struct p4_event_bind {
         unsigned int opcode;                    /* Event code and ESCR selector */
         unsigned int escr_msr[2];               /* ESCR MSR for this event */
+       unsigned int escr_emask;                /* valid ESCR EventMask bits */
+       unsigned int shared;                    /* event is shared across threads */
         char cntr[2][P4_CNTR_LIMIT];            /* counter index (offset), -1 on abscence */
  };
  
@@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = {
         [P4_EVENT_TC_DELIVER_MODE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
                 .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
+               .shared         = 1,
                 .cntr           = { {4, 5, -1}, {6, 7, -1} },
         },
         [P4_EVENT_BPU_FETCH_REQUEST] = {
                 .opcode         = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
                 .escr_msr       = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_ITLB_REFERENCE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
                 .escr_msr       = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_MEMORY_CANCEL] = {
                 .opcode         = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
                 .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_MEMORY_COMPLETE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
                 .escr_msr       = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_LOAD_PORT_REPLAY] = {
                 .opcode         = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
                 .escr_msr       = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_STORE_PORT_REPLAY] = {
                 .opcode         = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
                 .escr_msr       = { MSR_P4_SAAT_ESCR0 ,  MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_MOB_LOAD_REPLAY] = {
                 .opcode         = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
                 .escr_msr       = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_PAGE_WALK_TYPE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
                 .escr_msr       = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
+               .shared         = 1,
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_BSQ_CACHE_REFERENCE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
                 .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_IOQ_ALLOCATION] = {
                 .opcode         = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER)               |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_IOQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
                 .opcode         = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
                 .escr_msr       = { MSR_P4_FSB_ESCR1,  MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
                 .cntr           = { {2, -1, -1}, {3, -1, -1} },
         },
         [P4_EVENT_FSB_DATA_ACTIVITY] = {
                 .opcode         = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
+               .shared         = 1,
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_BSQ_ALLOCATION] = {           /* shared ESCR, broken CCCR1 */
                 .opcode         = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
                 .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE)      |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE)      |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
                 .cntr           = { {0, -1, -1}, {1, -1, -1} },
         },
         [P4_EVENT_BSQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
                 .opcode         = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
                 .escr_msr       = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE)     |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE)  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE)  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
                 .cntr           = { {2, -1, -1}, {3, -1, -1} },
         },
         [P4_EVENT_SSE_INPUT_ASSIST] = {
                 .opcode         = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_PACKED_SP_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_PACKED_DP_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_SCALAR_SP_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_SCALAR_DP_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_64BIT_MMX_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_128BIT_MMX_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_X87_FP_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_X87_FP_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_TC_MISC] = {
                 .opcode         = P4_OPCODE(P4_EVENT_TC_MISC),
                 .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
                 .cntr           = { {4, 5, -1}, {6, 7, -1} },
         },
         [P4_EVENT_GLOBAL_POWER_EVENTS] = {
                 .opcode         = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_TC_MS_XFER] = {
                 .opcode         = P4_OPCODE(P4_EVENT_TC_MS_XFER),
                 .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
                 .cntr           = { {4, 5, -1}, {6, 7, -1} },
         },
         [P4_EVENT_UOP_QUEUE_WRITES] = {
                 .opcode         = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
                 .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD)     |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
                 .cntr           = { {4, 5, -1}, {6, 7, -1} },
         },
         [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
                 .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
                 .cntr           = { {4, 5, -1}, {6, 7, -1} },
         },
         [P4_EVENT_RETIRED_BRANCH_TYPE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
                 .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
                 .cntr           = { {4, 5, -1}, {6, 7, -1} },
         },
         [P4_EVENT_RESOURCE_STALL] = {
                 .opcode         = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
                 .escr_msr       = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_WC_BUFFER] = {
                 .opcode         = P4_OPCODE(P4_EVENT_WC_BUFFER),
                 .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS)               |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_B2B_CYCLES] = {
                 .opcode         = P4_OPCODE(P4_EVENT_B2B_CYCLES),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_BNR] = {
                 .opcode         = P4_OPCODE(P4_EVENT_BNR),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_SNOOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_SNOOP),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_RESPONSE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_RESPONSE),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_FRONT_END_EVENT] = {
                 .opcode         = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
                 .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_EXECUTION_EVENT] = {
                 .opcode         = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
                 .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_REPLAY_EVENT] = {
                 .opcode         = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
                 .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_INSTR_RETIRED] = {
                 .opcode         = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
                 .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_UOPS_RETIRED] = {
                 .opcode         = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
                 .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_UOP_TYPE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_UOP_TYPE),
                 .escr_msr       = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS)                  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_BRANCH_RETIRED] = {
                 .opcode         = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
                 .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
                 .opcode         = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
                 .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+               P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_X87_ASSIST] = {
                 .opcode         = P4_OPCODE(P4_EVENT_X87_ASSIST),
                 .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_MACHINE_CLEAR] = {
                 .opcode         = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
                 .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_INSTR_COMPLETED] = {
                 .opcode         = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
                 .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
  };
@@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event)
         return config;
  }
  
+/* check cpu model specifics */
+static bool p4_event_match_cpu_model(unsigned int event_idx)
+{
+       /* The INSTR_COMPLETED event exists only on models 3, 4 and 6 (Prescott) */
+       if (event_idx == P4_EVENT_INSTR_COMPLETED) {
+               if (boot_cpu_data.x86_model != 3 &&
+                       boot_cpu_data.x86_model != 4 &&
+                       boot_cpu_data.x86_model != 6)
+                       return false;
+       }
+
+       /*
+        * For info
+        * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
+        */
+
+       return true;
+}
+
  static int p4_validate_raw_event(struct perf_event *event)
  {
-       unsigned int v;
+       unsigned int v, emask;
  
-       /* user data may have out-of-bound event index */
+       /* User data may have out-of-bound event index */
         v = p4_config_unpack_event(event->attr.config);
-       if (v >= ARRAY_SIZE(p4_event_bind_map)) {
-               pr_warning("P4 PMU: Unknown event code: %d\n", v);
+       if (v >= ARRAY_SIZE(p4_event_bind_map))
+               return -EINVAL;
+
+       /* It may be unsupported: */
+       if (!p4_event_match_cpu_model(v))
                 return -EINVAL;
+
+       /*
+        * NOTE: P4_CCCR_THREAD_ANY does not have the same meaning as
+        * in Architectural Performance Monitoring: it selects not
+        * on _which_ logical cpu to count but rather _when_, i.e. it
+        * depends on logical cpu state -- count the event if one cpu is
+        * active, none, both or any -- so we just allow the user to pass
+        * any value desired.
+        *
+        * In turn we always set Tx_OS/Tx_USR bits bound to logical
+        * cpu without their propagation to another cpu
+        */
+
+       /*
+        * if an event is shared across the logical threads
+        * the user needs special permissions to be able to use it
+        */
+       if (p4_event_bind_map[v].shared) {
+               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+                       return -EACCES;
         }
  
+       /* ESCR EventMask bits may be invalid */
+       emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
+       if (emask & ~p4_event_bind_map[v].escr_emask)
+               return -EINVAL;
+
         /*
-        * it may have some screwed PEBS bits
+        * it may have some invalid PEBS bits
          */
-       if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
-               pr_warning("P4 PMU: PEBS are not supported yet\n");
+       if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
                 return -EINVAL;
-       }
+
         v = p4_config_unpack_metric(event->attr.config);
-       if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
-               pr_warning("P4 PMU: Unknown metric code: %d\n", v);
+       if (v >= ARRAY_SIZE(p4_pebs_bind_map))
                 return -EINVAL;
-       }
  
         return 0;
  }
@@ -478,27 +728,21 @@ static int p4_hw_config(struct perf_event *event)
  
         if (event->attr.type == PERF_TYPE_RAW) {
  
+               /*
+                * Clear the bits that are reserved for the kernel itself
+                * and must never be set from user space
+                */
+                event->attr.config &= P4_CONFIG_MASK;
+
                 rc = p4_validate_raw_event(event);
                 if (rc)
                         goto out;
  
                 /*
-                * We don't control raw events so it's up to the caller
-                * to pass sane values (and we don't count the thread number
-                * on HT machine but allow HT-compatible specifics to be
-                * passed on)
-                *
                  * Note that for RAW events we allow user to use P4_CCCR_RESERVED
                  * bits since we keep additional info here (for cache events and etc)
-                *
-                * XXX: HT wide things should check perf_paranoid_cpu() &&
-                *      CAP_SYS_ADMIN
                  */
-               event->hw.config |= event->attr.config &
-                       (p4_config_pack_escr(P4_ESCR_MASK_HT) |
-                        p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
-
-               event->hw.config &= ~P4_CCCR_FORCE_OVF;
+               event->hw.config |= event->attr.config;
         }
  
         rc = x86_setup_perfctr(event);
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c

index 770ebfb349e93efe3367cf0c6caff93b61b8b884..e05952af5d26ff62f56d590510d1d170c59b16fb 100644 (file)
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
         return 0;
  }
  
-/* Dummy buffers for kallsyms_lookup */
-static char __dummy_buf[KSYM_NAME_LEN];
-
  /* Check if paddr is at an instruction boundary */
  static int __kprobes can_probe(unsigned long paddr)
  {
@@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr)
         struct insn insn;
         kprobe_opcode_t buf[MAX_INSN_SIZE];
  
-       if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
+       if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
                 return 0;
  
         /* Decode instructions */
@@ -1129,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
         *(unsigned long *)addr = val;
  }
  
-void __kprobes kprobes_optinsn_template_holder(void)
+static void __used __kprobes kprobes_optinsn_template_holder(void)
  {
         asm volatile (
                         ".global optprobe_template_entry\n"
@@ -1269,11 +1266,9 @@ static int __kprobes can_optimize(unsigned long paddr)
         unsigned long addr, size = 0, offset = 0;
         struct insn insn;
         kprobe_opcode_t buf[MAX_INSN_SIZE];
-       /* Dummy buffers for lookup_symbol_attrs */
-       static char __dummy_buf[KSYM_NAME_LEN];
  
         /* Lookup symbol including addr */
-       if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
+       if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
                 return 0;
  
         /* Check there is enough space for a relative jump. */
diff --git a/include/asm-generic/hardirq.h b/include/asm-generic/hardirq.h

index 62f59080e5cc215edb843f928763d9898055071f..04d0a977cd431fc5eb2c2f34cbd5390bfeb0db05 100644 (file)
--- a/include/asm-generic/hardirq.h
+++ b/include/asm-generic/hardirq.h
@@ -3,13 +3,13 @@
  
  #include <linux/cache.h>
  #include <linux/threads.h>
-#include <linux/irq.h>
  
  typedef struct {
         unsigned int __softirq_pending;
  } ____cacheline_aligned irq_cpustat_t;
  
  #include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */
+#include <linux/irq.h>
  
  #ifndef ack_bad_irq
  static inline void ack_bad_irq(unsigned int irq)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h

index 02b8b24f8f51f0e37156731ba94da19ba2d19131..8beabb958f61d5147c8893f1e780415a91fcb2e6 100644 (file)
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -191,8 +191,8 @@ struct ftrace_event_call {
         unsigned int            flags;
  
  #ifdef CONFIG_PERF_EVENTS
-       int                     perf_refcount;
-       struct hlist_head       *perf_events;
+       int                             perf_refcount;
+       struct hlist_head __percpu      *perf_events;
  #endif
  };
  
@@ -252,8 +252,8 @@ DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
  
  extern int  perf_trace_init(struct perf_event *event);
  extern void perf_trace_destroy(struct perf_event *event);
-extern int  perf_trace_enable(struct perf_event *event);
-extern void perf_trace_disable(struct perf_event *event);
+extern int  perf_trace_add(struct perf_event *event, int flags);
+extern void perf_trace_del(struct perf_event *event, int flags);
  extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
                                      char *filter_str);
  extern void ftrace_profile_free_filter(struct perf_event *event);
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h

index a0384a4d1e6f4da4d39a02c0f8ba6634842dd236..531495db17081efabcb649a3740673f03a531e67 100644 (file)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -18,6 +18,7 @@
  #include <asm/atomic.h>
  #include <asm/ptrace.h>
  #include <asm/system.h>
+#include <trace/events/irq.h>
  
  /*
   * These correspond to the IORESOURCE_IRQ_* defines in
@@ -407,7 +408,12 @@ asmlinkage void do_softirq(void);
  asmlinkage void __do_softirq(void);
  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
  extern void softirq_init(void);
-#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
+static inline void __raise_softirq_irqoff(unsigned int nr)
+{
+       trace_softirq_raise((struct softirq_action *)(unsigned long)nr, NULL);
+       or_softirq_pending(1UL << nr);
+}
+
  extern void raise_softirq_irqoff(unsigned int nr);
  extern void raise_softirq(unsigned int nr);
  extern void wakeup_softirqd(void);
diff --git a/include/linux/percpu.h b/include/linux/percpu.h

index 49466b13c5c6b310f36b31c052573864df6f02a0..0eb50832aa00fd1bbc31cc23837abfc698412402 100644 (file)
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -39,6 +39,15 @@
         preempt_enable();                               \
  } while (0)
  
+#define get_cpu_ptr(var) ({                            \
+       preempt_disable();                              \
+       this_cpu_ptr(var); })
+
+#define put_cpu_ptr(var) do {                          \
+       (void)(var);                                    \
+       preempt_enable();                               \
+} while (0)
+
  #ifdef CONFIG_SMP
  
  /* minimum unit size, also is the maximum supported allocation size */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h

index 716f99b682c1a57fb3b6f1f72e90aec3982ca5fd..61b1e2d760fdf6bf703ae6c05c0cdae0ac78ee73 100644 (file)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -529,7 +529,6 @@ struct hw_perf_event {
                         int             last_cpu;
                 };
                 struct { /* software */
-                       s64             remaining;
                         struct hrtimer  hrtimer;
                 };
  #ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -539,6 +538,7 @@ struct hw_perf_event {
                 };
  #endif
         };
+       int                             state;
         local64_t                       prev_count;
         u64                             sample_period;
         u64                             last_period;
@@ -550,6 +550,13 @@ struct hw_perf_event {
  #endif
  };
  
+/*
+ * hw_perf_event::state flags
+ */
+#define PERF_HES_STOPPED       0x01 /* the counter is stopped */
+#define PERF_HES_UPTODATE      0x02 /* event->count up-to-date */
+#define PERF_HES_ARCH          0x04
+
  struct perf_event;
  
  /*
@@ -561,36 +568,70 @@ struct perf_event;
   * struct pmu - generic performance monitoring unit
   */
  struct pmu {
-       int (*enable)                   (struct perf_event *event);
-       void (*disable)                 (struct perf_event *event);
-       int (*start)                    (struct perf_event *event);
-       void (*stop)                    (struct perf_event *event);
-       void (*read)                    (struct perf_event *event);
-       void (*unthrottle)              (struct perf_event *event);
+       struct list_head                entry;
+
+       int * __percpu                  pmu_disable_count;
+       struct perf_cpu_context * __percpu pmu_cpu_context;
+       int                             task_ctx_nr;
  
         /*
-        * Group events scheduling is treated as a transaction, add group
-        * events as a whole and perform one schedulability test. If the test
-        * fails, roll back the whole group
+        * Fully disable/enable this PMU, can be used to protect from the PMI
+        * as well as for lazy/batch writing of the MSRs.
          */
+       void (*pmu_enable)              (struct pmu *pmu); /* optional */
+       void (*pmu_disable)             (struct pmu *pmu); /* optional */
  
         /*
-        * Start the transaction, after this ->enable() doesn't need
-        * to do schedulability tests.
+        * Try and initialize the event for this PMU.
+        * Should return -ENOENT when the @event doesn't match this PMU.
          */
-       void (*start_txn)       (const struct pmu *pmu);
+       int (*event_init)               (struct perf_event *event);
+
+#define PERF_EF_START  0x01            /* start the counter when adding    */
+#define PERF_EF_RELOAD 0x02            /* reload the counter when starting */
+#define PERF_EF_UPDATE 0x04            /* update the counter when stopping */
+
         /*
-        * If ->start_txn() disabled the ->enable() schedulability test
+        * Adds/Removes a counter to/from the PMU, can be done inside
+        * a transaction, see the ->*_txn() methods.
+        */
+       int  (*add)                     (struct perf_event *event, int flags);
+       void (*del)                     (struct perf_event *event, int flags);
+
+       /*
+        * Starts/Stops a counter present on the PMU. The PMI handler
+        * should stop the counter when perf_event_overflow() returns
+        * !0. ->start() will be used to continue.
+        */
+       void (*start)                   (struct perf_event *event, int flags);
+       void (*stop)                    (struct perf_event *event, int flags);
+
+       /*
+        * Updates the counter value of the event.
+        */
+       void (*read)                    (struct perf_event *event);
+
+       /*
+        * Group events scheduling is treated as a transaction, add
+        * group events as a whole and perform one schedulability test.
+        * If the test fails, roll back the whole group
+        *
+        * Start the transaction, after this ->add() doesn't need to
+        * do schedulability tests.
+        */
+       void (*start_txn)       (struct pmu *pmu); /* optional */
+       /*
+        * If ->start_txn() disabled the ->add() schedulability test
          * then ->commit_txn() is required to perform one. On success
          * the transaction is closed. On error the transaction is kept
          * open until ->cancel_txn() is called.
          */
-       int  (*commit_txn)      (const struct pmu *pmu);
+       int  (*commit_txn)      (struct pmu *pmu); /* optional */
         /*
-        * Will cancel the transaction, assumes ->disable() is called for
-        * each successfull ->enable() during the transaction.
+        * Will cancel the transaction, assumes ->del() is called
+        * for each successful ->add() during the transaction.
          */
-       void (*cancel_txn)      (const struct pmu *pmu);
+       void (*cancel_txn)      (struct pmu *pmu); /* optional */
  };
  
  /**
@@ -669,7 +710,7 @@ struct perf_event {
         int                             nr_siblings;
         int                             group_flags;
         struct perf_event               *group_leader;
-       const struct pmu                *pmu;
+       struct pmu                      *pmu;
  
         enum perf_event_active_state    state;
         unsigned int                    attach_state;
@@ -763,12 +804,19 @@ struct perf_event {
  #endif /* CONFIG_PERF_EVENTS */
  };
  
+enum perf_event_context_type {
+       task_context,
+       cpu_context,
+};
+
  /**
   * struct perf_event_context - event context structure
   *
   * Used as a container for task events and CPU events as well:
   */
  struct perf_event_context {
+       enum perf_event_context_type    type;
+       struct pmu                      *pmu;
         /*
          * Protect the states of the events in the list,
          * nr_active, and the list:
@@ -808,6 +856,12 @@ struct perf_event_context {
         struct rcu_head                 rcu_head;
  };
  
+/*
+ * Number of contexts where an event can trigger:
+ *     task, softirq, hardirq, nmi.
+ */
+#define PERF_NR_CONTEXTS       4
+
  /**
   * struct perf_event_cpu_context - per cpu event context structure
   */
@@ -815,18 +869,9 @@ struct perf_cpu_context {
         struct perf_event_context       ctx;
         struct perf_event_context       *task_ctx;
         int                             active_oncpu;
-       int                             max_pertask;
         int                             exclusive;
-       struct swevent_hlist            *swevent_hlist;
-       struct mutex                    hlist_mutex;
-       int                             hlist_refcount;
-
-       /*
-        * Recursion avoidance:
-        *
-        * task, softirq, irq, nmi context
-        */
-       int                             recursion[4];
+       struct list_head                rotation_list;
+       int                             jiffies_interval;
  };
  
  struct perf_output_handle {
@@ -842,26 +887,20 @@ struct perf_output_handle {
  
  #ifdef CONFIG_PERF_EVENTS
  
-/*
- * Set by architecture code:
- */
-extern int perf_max_events;
-
-extern const struct pmu *hw_perf_event_init(struct perf_event *event);
+extern int perf_pmu_register(struct pmu *pmu);
+extern void perf_pmu_unregister(struct pmu *pmu);
  
  extern void perf_event_task_sched_in(struct task_struct *task);
  extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
-extern void perf_event_task_tick(struct task_struct *task);
  extern int perf_event_init_task(struct task_struct *child);
  extern void perf_event_exit_task(struct task_struct *child);
  extern void perf_event_free_task(struct task_struct *task);
+extern void perf_event_delayed_put(struct task_struct *task);
  extern void set_perf_event_pending(void);
  extern void perf_event_do_pending(void);
  extern void perf_event_print_debug(void);
-extern void __perf_disable(void);
-extern bool __perf_enable(void);
-extern void perf_disable(void);
-extern void perf_enable(void);
+extern void perf_pmu_disable(struct pmu *pmu);
+extern void perf_pmu_enable(struct pmu *pmu);
  extern int perf_event_task_disable(void);
  extern int perf_event_task_enable(void);
  extern void perf_event_update_userpage(struct perf_event *event);
@@ -869,7 +908,7 @@ extern int perf_event_release_kernel(struct perf_event *event);
  extern struct perf_event *
  perf_event_create_kernel_counter(struct perf_event_attr *attr,
                                 int cpu,
-                               pid_t pid,
+                               struct task_struct *task,
                                 perf_overflow_handler_t callback);
  extern u64 perf_event_read_value(struct perf_event *event,
                                  u64 *enabled, u64 *running);
@@ -920,14 +959,7 @@ extern int perf_event_overflow(struct perf_event *event, int nmi,
   */
  static inline int is_software_event(struct perf_event *event)
  {
-       switch (event->attr.type) {
-       case PERF_TYPE_SOFTWARE:
-       case PERF_TYPE_TRACEPOINT:
-       /* for now the breakpoint stuff also works as software event */
-       case PERF_TYPE_BREAKPOINT:
-               return 1;
-       }
-       return 0;
+       return event->pmu->task_ctx_nr == perf_sw_context;
  }
  
  extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -976,7 +1008,21 @@ extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks
  extern void perf_event_comm(struct task_struct *tsk);
  extern void perf_event_fork(struct task_struct *tsk);
  
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+/* Callchains */
+DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
+
+extern void perf_callchain_user(struct perf_callchain_entry *entry,
+                               struct pt_regs *regs);
+extern void perf_callchain_kernel(struct perf_callchain_entry *entry,
+                                 struct pt_regs *regs);
+
+
+static inline void
+perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
+{
+       if (entry->nr < PERF_MAX_STACK_DEPTH)
+               entry->ip[entry->nr++] = ip;
+}
  
  extern int sysctl_perf_event_paranoid;
  extern int sysctl_perf_event_mlock;
@@ -1019,21 +1065,19 @@ extern int perf_swevent_get_recursion_context(void);
  extern void perf_swevent_put_recursion_context(int rctx);
  extern void perf_event_enable(struct perf_event *event);
  extern void perf_event_disable(struct perf_event *event);
+extern void perf_event_task_tick(void);
  #else
  static inline void
  perf_event_task_sched_in(struct task_struct *task)                     { }
  static inline void
  perf_event_task_sched_out(struct task_struct *task,
                             struct task_struct *next)                   { }
-static inline void
-perf_event_task_tick(struct task_struct *task)                         { }
  static inline int perf_event_init_task(struct task_struct *child)      { return 0; }
  static inline void perf_event_exit_task(struct task_struct *child)     { }
  static inline void perf_event_free_task(struct task_struct *task)      { }
+static inline void perf_event_delayed_put(struct task_struct *task)    { }
  static inline void perf_event_do_pending(void)                         { }
  static inline void perf_event_print_debug(void)                                { }
-static inline void perf_disable(void)                                  { }
-static inline void perf_enable(void)                                   { }
  static inline int perf_event_task_disable(void)                                { return -EINVAL; }
  static inline int perf_event_task_enable(void)                         { return -EINVAL; }
  
@@ -1056,6 +1100,7 @@ static inline int  perf_swevent_get_recursion_context(void)               { return -1; }
  static inline void perf_swevent_put_recursion_context(int rctx)                { }
  static inline void perf_event_enable(struct perf_event *event)         { }
  static inline void perf_event_disable(struct perf_event *event)                { }
+static inline void perf_event_task_tick(void)                          { }
  #endif
  
  #define perf_output_put(handle, x) \
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 1e2a6db2d7dd03466bf850dc5011860c23e8f9c9..eb3c1ceec06e1f9f1f2833258c737008d8df79b9 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1160,6 +1160,13 @@ struct sched_rt_entity {
  
  struct rcu_node;
  
+enum perf_event_task_context {
+       perf_invalid_context = -1,
+       perf_hw_context = 0,
+       perf_sw_context,
+       perf_nr_task_contexts,
+};
+
  struct task_struct {
         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
         void *stack;
@@ -1431,7 +1438,7 @@ struct task_struct {
         struct futex_pi_state *pi_state_cache;
  #endif
  #ifdef CONFIG_PERF_EVENTS
-       struct perf_event_context *perf_event_ctxp;
+       struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
         struct mutex perf_event_mutex;
         struct list_head perf_event_list;
  #endif
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h

index 0e4cfb694fe70630457af67e1b1bc568f56c9b09..6fa7cbab7d932c6649e9fbd4221615b6b4d0fd8d 100644 (file)
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -5,7 +5,9 @@
  #define _TRACE_IRQ_H
  
  #include <linux/tracepoint.h>
-#include <linux/interrupt.h>
+
+struct irqaction;
+struct softirq_action;
  
  #define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq }
  #define show_softirq_name(val)                         \
@@ -93,7 +95,10 @@ DECLARE_EVENT_CLASS(softirq,
         ),
  
         TP_fast_assign(
-               __entry->vec = (int)(h - vec);
+               if (vec)
+                       __entry->vec = (int)(h - vec);
+               else
+                       __entry->vec = (int)(long)h;
         ),
  
         TP_printk("vec=%d [action=%s]", __entry->vec,
@@ -136,6 +141,23 @@ DEFINE_EVENT(softirq, softirq_exit,
         TP_ARGS(h, vec)
  );
  
+/**
+ * softirq_raise - called immediately when a softirq is raised
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * For this event @h is not a real pointer: it carries the number of the
+ * raised softirq vector cast to a pointer, and @vec is NULL to signal that
+ * encoding. When used in combination with the softirq_entry tracepoint
+ * we can determine the softirq raise latency.
+ */
+DEFINE_EVENT(softirq, softirq_raise,
+
+       TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
+       TP_ARGS(h, vec)
+);
+
  #endif /*  _TRACE_IRQ_H */
  
  /* This part must be outside protection */
diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h

index 188deca2f3c7721a1baac60cc07e8d7006442c71..8fe1e93f531dd81a8e549689a1b8e9b551231e9e 100644 (file)
--- a/include/trace/events/napi.h
+++ b/include/trace/events/napi.h
@@ -6,10 +6,31 @@
  
  #include <linux/netdevice.h>
  #include <linux/tracepoint.h>
+#include <linux/ftrace.h>
+
+#define NO_DEV "(no_device)"
+
+TRACE_EVENT(napi_poll,
  
-DECLARE_TRACE(napi_poll,
         TP_PROTO(struct napi_struct *napi),
-       TP_ARGS(napi));
+
+       TP_ARGS(napi),
+
+       TP_STRUCT__entry(
+               __field(        struct napi_struct *,   napi)
+               __string(       dev_name, napi->dev ? napi->dev->name : NO_DEV)
+       ),
+
+       TP_fast_assign(
+               __entry->napi = napi;
+               __assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV);
+       ),
+
+       TP_printk("napi poll on napi struct %p for device %s",
+               __entry->napi, __get_str(dev_name))
+);
+
+#undef NO_DEV
  
  #endif /* _TRACE_NAPI_H_ */
  
diff --git a/include/trace/events/net.h b/include/trace/events/net.h

new file mode 100644 (file)

index 0000000..5f247f5
--- /dev/null
+++ b/include/trace/events/net.h
@@ -0,0 +1,82 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM net
+
+#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NET_H
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(net_dev_xmit,
+
+       TP_PROTO(struct sk_buff *skb,
+                int rc),
+
+       TP_ARGS(skb, rc),
+
+       TP_STRUCT__entry(
+               __field(        void *,         skbaddr         )
+               __field(        unsigned int,   len             )
+               __field(        int,            rc              )
+               __string(       name,           skb->dev->name  )
+       ),
+
+       TP_fast_assign(
+               __entry->skbaddr = skb;
+               __entry->len = skb->len;
+               __entry->rc = rc;
+               __assign_str(name, skb->dev->name);
+       ),
+
+       TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
+               __get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
+);
+
+DECLARE_EVENT_CLASS(net_dev_template,
+
+       TP_PROTO(struct sk_buff *skb),
+
+       TP_ARGS(skb),
+
+       TP_STRUCT__entry(
+               __field(        void *,         skbaddr         )
+               __field(        unsigned int,   len             )
+               __string(       name,           skb->dev->name  )
+       ),
+
+       TP_fast_assign(
+               __entry->skbaddr = skb;
+               __entry->len = skb->len;
+               __assign_str(name, skb->dev->name);
+       ),
+
+       TP_printk("dev=%s skbaddr=%p len=%u",
+               __get_str(name), __entry->skbaddr, __entry->len)
+)
+
+DEFINE_EVENT(net_dev_template, net_dev_queue,
+
+       TP_PROTO(struct sk_buff *skb),
+
+       TP_ARGS(skb)
+);
+
+DEFINE_EVENT(net_dev_template, netif_receive_skb,
+
+       TP_PROTO(struct sk_buff *skb),
+
+       TP_ARGS(skb)
+);
+
+DEFINE_EVENT(net_dev_template, netif_rx,
+
+       TP_PROTO(struct sk_buff *skb),
+
+       TP_ARGS(skb)
+);
+#endif /* _TRACE_NET_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/power.h b/include/trace/events/power.h

index 35a2a6e7bf1e74992b8b83b242c4a507fed4246f..286784d69b8f480343244d8046327a5a7d9883d9 100644 (file)
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -10,12 +10,17 @@
  #ifndef _TRACE_POWER_ENUM_
  #define _TRACE_POWER_ENUM_
  enum {
-       POWER_NONE = 0,
-       POWER_CSTATE = 1,
-       POWER_PSTATE = 2,
+       POWER_NONE      = 0,
+       POWER_CSTATE    = 1,    /* C-State */
+       POWER_PSTATE    = 2,    /* Frequency change or DVFS */
+       POWER_SSTATE    = 3,    /* Suspend */
  };
  #endif
  
+/*
+ * The power events are used for cpuidle & suspend (power_start, power_end)
+ *  and for cpufreq (power_frequency)
+ */
  DECLARE_EVENT_CLASS(power,
  
         TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id),
@@ -70,6 +75,85 @@ TRACE_EVENT(power_end,
  
  );
  
+/*
+ * The clock events are used for clock enable/disable and for
+ *  clock rate change
+ */
+DECLARE_EVENT_CLASS(clock,
+
+       TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(name, state, cpu_id),
+
+       TP_STRUCT__entry(
+               __string(       name,           name            )
+               __field(        u64,            state           )
+               __field(        u64,            cpu_id          )
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, name);
+               __entry->state = state;
+               __entry->cpu_id = cpu_id;
+       ),
+
+       TP_printk("%s state=%lu cpu_id=%lu", __get_str(name),
+               (unsigned long)__entry->state, (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(clock, clock_enable,
+
+       TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(name, state, cpu_id)
+);
+
+DEFINE_EVENT(clock, clock_disable,
+
+       TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(name, state, cpu_id)
+);
+
+DEFINE_EVENT(clock, clock_set_rate,
+
+       TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(name, state, cpu_id)
+);
+
+/*
+ * The power domain events are used for power domains transitions
+ */
+DECLARE_EVENT_CLASS(power_domain,
+
+       TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(name, state, cpu_id),
+
+       TP_STRUCT__entry(
+               __string(       name,           name            )
+               __field(        u64,            state           )
+               __field(        u64,            cpu_id          )
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, name);
+               __entry->state = state;
+               __entry->cpu_id = cpu_id;
+),
+
+       TP_printk("%s state=%lu cpu_id=%lu", __get_str(name),
+               (unsigned long)__entry->state, (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(power_domain, power_domain_target,
+
+       TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(name, state, cpu_id)
+);
+
  #endif /* _TRACE_POWER_H */
  
  /* This part must be outside protection */
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h

index 4b2be6dc76f091647eb30f30a40a49218def8682..75ce9d500d8e3c62dbfffbc0acafca13d11d90aa 100644 (file)
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -35,6 +35,23 @@ TRACE_EVENT(kfree_skb,
                 __entry->skbaddr, __entry->protocol, __entry->location)
  );
  
+TRACE_EVENT(consume_skb,
+
+       TP_PROTO(struct sk_buff *skb),
+
+       TP_ARGS(skb),
+
+       TP_STRUCT__entry(
+               __field(        void *, skbaddr )
+       ),
+
+       TP_fast_assign(
+               __entry->skbaddr = skb;
+       ),
+
+       TP_printk("skbaddr=%p", __entry->skbaddr)
+);
+
  TRACE_EVENT(skb_copy_datagram_iovec,
  
         TP_PROTO(const struct sk_buff *skb, int len),
diff --git a/kernel/exit.c b/kernel/exit.c

index 03120229db2802929065a210930e41c7fa701ba0..e2bdf37f9fdea71a15acb3523ec20b63a6aa42d3 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
  {
         struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
  
-#ifdef CONFIG_PERF_EVENTS
-       WARN_ON_ONCE(tsk->perf_event_ctxp);
-#endif
+       perf_event_delayed_put(tsk);
         trace_sched_process_free(tsk);
         put_task_struct(tsk);
  }
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c

index c7c2aed9e2dcc2e52669e7d3880be0666c496bff..3b714e839c1053ec102f99bf8d924424bdf09381 100644 (file)
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
                             perf_overflow_handler_t triggered,
                             struct task_struct *tsk)
  {
-       return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk),
-                                               triggered);
+       return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
  }
  EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
  
@@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
         get_online_cpus();
         for_each_online_cpu(cpu) {
                 pevent = per_cpu_ptr(cpu_events, cpu);
-               bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
+               bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
  
                 *pevent = bp;
  
@@ -566,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
         .priority = 0x7fffffff
  };
  
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+       release_bp_slot(event);
+}
+
+static int hw_breakpoint_event_init(struct perf_event *bp)
+{
+       int err;
+
+       if (bp->attr.type != PERF_TYPE_BREAKPOINT)
+               return -ENOENT;
+
+       err = register_perf_hw_breakpoint(bp);
+       if (err)
+               return err;
+
+       bp->destroy = bp_perf_event_destroy;
+
+       return 0;
+}
+
+static int hw_breakpoint_add(struct perf_event *bp, int flags)
+{
+       if (!(flags & PERF_EF_START))
+               bp->hw.state = PERF_HES_STOPPED;
+
+       return arch_install_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_del(struct perf_event *bp, int flags)
+{
+       arch_uninstall_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_start(struct perf_event *bp, int flags)
+{
+       bp->hw.state = 0;
+}
+
+static void hw_breakpoint_stop(struct perf_event *bp, int flags)
+{
+       bp->hw.state = PERF_HES_STOPPED;
+}
+
+static struct pmu perf_breakpoint = {
+       .task_ctx_nr    = perf_sw_context, /* could eventually get its own */
+
+       .event_init     = hw_breakpoint_event_init,
+       .add            = hw_breakpoint_add,
+       .del            = hw_breakpoint_del,
+       .start          = hw_breakpoint_start,
+       .stop           = hw_breakpoint_stop,
+       .read           = hw_breakpoint_pmu_read,
+};
+
  static int __init init_hw_breakpoint(void)
  {
         unsigned int **task_bp_pinned;
@@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void)
  
         constraints_initialized = 1;
  
+       perf_pmu_register(&perf_breakpoint);
+
         return register_die_notifier(&hw_breakpoint_exceptions_nb);
  
   err_alloc:
@@ -602,8 +658,3 @@ static int __init init_hw_breakpoint(void)
  core_initcall(init_hw_breakpoint);
  
  
-struct pmu perf_ops_bp = {
-       .enable         = arch_install_hw_breakpoint,
-       .disable        = arch_uninstall_hw_breakpoint,
-       .read           = hw_breakpoint_pmu_read,
-};
diff --git a/kernel/kprobes.c b/kernel/kprobes.c

index 282035f3ae964e1e288f352c370be8edd11d3078..6dd5359e1f0e911103fcc935d8844d253a6ef073 100644 (file)
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -399,7 +399,7 @@ static inline int kprobe_optready(struct kprobe *p)
   * Return an optimized kprobe whose optimizing code replaces
   * instructions including addr (exclude breakpoint).
   */
-struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
+static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
  {
         int i;
         struct kprobe *p = NULL;
@@ -831,6 +831,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
  
  void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
                          struct hlist_head **head, unsigned long *flags)
+__acquires(hlist_lock)
  {
         unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
         spinlock_t *hlist_lock;
@@ -842,6 +843,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
  
  static void __kprobes kretprobe_table_lock(unsigned long hash,
         unsigned long *flags)
+__acquires(hlist_lock)
  {
         spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
         spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +851,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
  
  void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
         unsigned long *flags)
+__releases(hlist_lock)
  {
         unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
         spinlock_t *hlist_lock;
@@ -857,7 +860,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
         spin_unlock_irqrestore(hlist_lock, *flags);
  }
  
-void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
+static void __kprobes kretprobe_table_unlock(unsigned long hash,
+       unsigned long *flags)
+__releases(hlist_lock)
  {
         spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
         spin_unlock_irqrestore(hlist_lock, *flags);
@@ -1339,18 +1344,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
         if (num <= 0)
                 return -EINVAL;
         for (i = 0; i < num; i++) {
-               unsigned long addr;
+               unsigned long addr, offset;
                 jp = jps[i];
                 addr = arch_deref_entry_point(jp->entry);
  
-               if (!kernel_text_address(addr))
-                       ret = -EINVAL;
-               else {
-                       /* Todo: Verify probepoint is a function entry point */
+               /* Verify probepoint is a function entry point */
+               if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
+                   offset == 0) {
                         jp->kp.pre_handler = setjmp_pre_handler;
                         jp->kp.break_handler = longjmp_break_handler;
                         ret = register_kprobe(&jp->kp);
-               }
+               } else
+                       ret = -EINVAL;
+
                 if (ret < 0) {
                         if (i > 0)
                                 unregister_jprobes(jps, i);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c

index db5b56064687e453c0df1cc118b975ea047bdcae..c16158c77dfd015a64ee0d0efac9b7617224dd39 100644 (file)
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,24 +31,18 @@
  #include <linux/kernel_stat.h>
  #include <linux/perf_event.h>
  #include <linux/ftrace_event.h>
-#include <linux/hw_breakpoint.h>
  
  #include <asm/irq_regs.h>
  
-/*
- * Each CPU has a list of per CPU events:
- */
-static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
-
-int perf_max_events __read_mostly = 1;
-static int perf_reserved_percpu __read_mostly;
-static int perf_overcommit __read_mostly = 1;
-
  static atomic_t nr_events __read_mostly;
  static atomic_t nr_mmap_events __read_mostly;
  static atomic_t nr_comm_events __read_mostly;
  static atomic_t nr_task_events __read_mostly;
  
+static LIST_HEAD(pmus);
+static DEFINE_MUTEX(pmus_lock);
+static struct srcu_struct pmus_srcu;
+
  /*
   * perf event paranoia level:
   *  -1 - not paranoid at all
@@ -67,36 +61,38 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
  
  static atomic64_t perf_event_id;
  
-/*
- * Lock for (sysadmin-configurable) event reservations:
- */
-static DEFINE_SPINLOCK(perf_resource_lock);
+void __weak perf_event_print_debug(void)       { }
  
-/*
- * Architecture provided APIs - weak aliases:
- */
-extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
+void perf_pmu_disable(struct pmu *pmu)
  {
-       return NULL;
+       int *count = this_cpu_ptr(pmu->pmu_disable_count);
+       if (!(*count)++)
+               pmu->pmu_disable(pmu);
  }
  
-void __weak hw_perf_disable(void)              { barrier(); }
-void __weak hw_perf_enable(void)               { barrier(); }
-
-void __weak perf_event_print_debug(void)       { }
-
-static DEFINE_PER_CPU(int, perf_disable_count);
-
-void perf_disable(void)
+void perf_pmu_enable(struct pmu *pmu)
  {
-       if (!__get_cpu_var(perf_disable_count)++)
-               hw_perf_disable();
+       int *count = this_cpu_ptr(pmu->pmu_disable_count);
+       if (!--(*count))
+               pmu->pmu_enable(pmu);
  }
  
-void perf_enable(void)
+static DEFINE_PER_CPU(struct list_head, rotation_list);
+
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
+static void perf_pmu_rotate_start(struct pmu *pmu)
  {
-       if (!--__get_cpu_var(perf_disable_count))
-               hw_perf_enable();
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct list_head *head = &__get_cpu_var(rotation_list);
+
+       WARN_ON(!irqs_disabled());
+
+       if (list_empty(&cpuctx->rotation_list))
+               list_add(&cpuctx->rotation_list, head);
  }
  
  static void get_ctx(struct perf_event_context *ctx)
@@ -151,13 +147,13 @@ static u64 primary_event_id(struct perf_event *event)
   * the context could get moved to another task.
   */
  static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
  {
         struct perf_event_context *ctx;
  
         rcu_read_lock();
- retry:
-       ctx = rcu_dereference(task->perf_event_ctxp);
+retry:
+       ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
         if (ctx) {
                 /*
                  * If this context is a clone of another, it might
@@ -170,7 +166,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
                  * can't get swapped on us any more.
                  */
                 raw_spin_lock_irqsave(&ctx->lock, *flags);
-               if (ctx != rcu_dereference(task->perf_event_ctxp)) {
+               if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
                         goto retry;
                 }
@@ -189,12 +185,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
   * can't get swapped to another task.  This also increments its
   * reference count so that the context can't get freed.
   */
-static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
+static struct perf_event_context *
+perf_pin_task_context(struct task_struct *task, int ctxn)
  {
         struct perf_event_context *ctx;
         unsigned long flags;
  
-       ctx = perf_lock_task_context(task, &flags);
+       ctx = perf_lock_task_context(task, ctxn, &flags);
         if (ctx) {
                 ++ctx->pin_count;
                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -302,6 +299,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
         }
  
         list_add_rcu(&event->event_entry, &ctx->event_list);
+       if (!ctx->nr_events)
+               perf_pmu_rotate_start(ctx->pmu);
         ctx->nr_events++;
         if (event->attr.inherit_stat)
                 ctx->nr_stat++;
@@ -436,7 +435,7 @@ event_sched_out(struct perf_event *event,
                 event->state = PERF_EVENT_STATE_OFF;
         }
         event->tstamp_stopped = ctx->time;
-       event->pmu->disable(event);
+       event->pmu->del(event, 0);
         event->oncpu = -1;
  
         if (!is_software_event(event))
@@ -466,6 +465,12 @@ group_sched_out(struct perf_event *group_event,
                 cpuctx->exclusive = 0;
  }
  
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+       return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
  /*
   * Cross CPU call to remove a performance event
   *
@@ -474,9 +479,9 @@ group_sched_out(struct perf_event *group_event,
   */
  static void __perf_event_remove_from_context(void *info)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
         struct perf_event *event = info;
         struct perf_event_context *ctx = event->ctx;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  
         /*
          * If this is a task context, we need to check whether it is
@@ -487,27 +492,11 @@ static void __perf_event_remove_from_context(void *info)
                 return;
  
         raw_spin_lock(&ctx->lock);
-       /*
-        * Protect the list operation against NMI by disabling the
-        * events on a global level.
-        */
-       perf_disable();
  
         event_sched_out(event, cpuctx, ctx);
  
         list_del_event(event, ctx);
  
-       if (!ctx->task) {
-               /*
-                * Allow more per task events with respect to the
-                * reservation:
-                */
-               cpuctx->max_pertask =
-                       min(perf_max_events - ctx->nr_events,
-                           perf_max_events - perf_reserved_percpu);
-       }
-
-       perf_enable();
         raw_spin_unlock(&ctx->lock);
  }
  
@@ -572,8 +561,8 @@ retry:
  static void __perf_event_disable(void *info)
  {
         struct perf_event *event = info;
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
         struct perf_event_context *ctx = event->ctx;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  
         /*
          * If this is a per-task event, need to check whether this
@@ -628,7 +617,7 @@ void perf_event_disable(struct perf_event *event)
                 return;
         }
  
- retry:
+retry:
         task_oncpu_function_call(task, __perf_event_disable, event);
  
         raw_spin_lock_irq(&ctx->lock);
@@ -667,7 +656,7 @@ event_sched_in(struct perf_event *event,
          */
         smp_wmb();
  
-       if (event->pmu->enable(event)) {
+       if (event->pmu->add(event, PERF_EF_START)) {
                 event->state = PERF_EVENT_STATE_INACTIVE;
                 event->oncpu = -1;
                 return -EAGAIN;
@@ -691,22 +680,15 @@ group_sched_in(struct perf_event *group_event,
                struct perf_event_context *ctx)
  {
         struct perf_event *event, *partial_group = NULL;
-       const struct pmu *pmu = group_event->pmu;
-       bool txn = false;
+       struct pmu *pmu = group_event->pmu;
  
         if (group_event->state == PERF_EVENT_STATE_OFF)
                 return 0;
  
-       /* Check if group transaction availabe */
-       if (pmu->start_txn)
-               txn = true;
-
-       if (txn)
-               pmu->start_txn(pmu);
+       pmu->start_txn(pmu);
  
         if (event_sched_in(group_event, cpuctx, ctx)) {
-               if (txn)
-                       pmu->cancel_txn(pmu);
+               pmu->cancel_txn(pmu);
                 return -EAGAIN;
         }
  
@@ -720,7 +702,7 @@ group_sched_in(struct perf_event *group_event,
                 }
         }
  
-       if (!txn || !pmu->commit_txn(pmu))
+       if (!pmu->commit_txn(pmu))
                 return 0;
  
  group_error:
@@ -735,8 +717,7 @@ group_error:
         }
         event_sched_out(group_event, cpuctx, ctx);
  
-       if (txn)
-               pmu->cancel_txn(pmu);
+       pmu->cancel_txn(pmu);
  
         return -EAGAIN;
  }
@@ -789,10 +770,10 @@ static void add_event_to_ctx(struct perf_event *event,
   */
  static void __perf_install_in_context(void *info)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
         struct perf_event *event = info;
         struct perf_event_context *ctx = event->ctx;
         struct perf_event *leader = event->group_leader;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
         int err;
  
         /*
@@ -812,12 +793,6 @@ static void __perf_install_in_context(void *info)
         ctx->is_active = 1;
         update_context_time(ctx);
  
-       /*
-        * Protect the list operation against NMI by disabling the
-        * events on a global level. NOP for non NMI based events.
-        */
-       perf_disable();
-
         add_event_to_ctx(event, ctx);
  
         if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -855,12 +830,7 @@ static void __perf_install_in_context(void *info)
                 }
         }
  
-       if (!err && !ctx->task && cpuctx->max_pertask)
-               cpuctx->max_pertask--;
-
- unlock:
-       perf_enable();
-
+unlock:
         raw_spin_unlock(&ctx->lock);
  }
  
@@ -883,6 +853,8 @@ perf_install_in_context(struct perf_event_context *ctx,
  {
         struct task_struct *task = ctx->task;
  
+       event->ctx = ctx;
+
         if (!task) {
                 /*
                  * Per cpu events are installed via an smp call and
@@ -931,10 +903,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
  
         event->state = PERF_EVENT_STATE_INACTIVE;
         event->tstamp_enabled = ctx->time - event->total_time_enabled;
-       list_for_each_entry(sub, &event->sibling_list, group_entry)
-               if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+       list_for_each_entry(sub, &event->sibling_list, group_entry) {
+               if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
                         sub->tstamp_enabled =
                                 ctx->time - sub->total_time_enabled;
+               }
+       }
  }
  
  /*
@@ -943,9 +917,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
  static void __perf_event_enable(void *info)
  {
         struct perf_event *event = info;
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
         struct perf_event_context *ctx = event->ctx;
         struct perf_event *leader = event->group_leader;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
         int err;
  
         /*
@@ -979,12 +953,10 @@ static void __perf_event_enable(void *info)
         if (!group_can_go_on(event, cpuctx, 1)) {
                 err = -EEXIST;
         } else {
-               perf_disable();
                 if (event == leader)
                         err = group_sched_in(event, cpuctx, ctx);
                 else
                         err = event_sched_in(event, cpuctx, ctx);
-               perf_enable();
         }
  
         if (err) {
@@ -1000,7 +972,7 @@ static void __perf_event_enable(void *info)
                 }
         }
  
- unlock:
+unlock:
         raw_spin_unlock(&ctx->lock);
  }
  
@@ -1041,7 +1013,7 @@ void perf_event_enable(struct perf_event *event)
         if (event->state == PERF_EVENT_STATE_ERROR)
                 event->state = PERF_EVENT_STATE_OFF;
  
- retry:
+retry:
         raw_spin_unlock_irq(&ctx->lock);
         task_oncpu_function_call(task, __perf_event_enable, event);
  
@@ -1061,7 +1033,7 @@ void perf_event_enable(struct perf_event *event)
         if (event->state == PERF_EVENT_STATE_OFF)
                 __perf_event_mark_enabled(event, ctx);
  
- out:
+out:
         raw_spin_unlock_irq(&ctx->lock);
  }
  
@@ -1092,26 +1064,26 @@ static void ctx_sched_out(struct perf_event_context *ctx,
         struct perf_event *event;
  
         raw_spin_lock(&ctx->lock);
+       perf_pmu_disable(ctx->pmu);
         ctx->is_active = 0;
         if (likely(!ctx->nr_events))
                 goto out;
         update_context_time(ctx);
  
-       perf_disable();
         if (!ctx->nr_active)
-               goto out_enable;
+               goto out;
  
-       if (event_type & EVENT_PINNED)
+       if (event_type & EVENT_PINNED) {
                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
                         group_sched_out(event, cpuctx, ctx);
+       }
  
-       if (event_type & EVENT_FLEXIBLE)
+       if (event_type & EVENT_FLEXIBLE) {
                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
                         group_sched_out(event, cpuctx, ctx);
-
- out_enable:
-       perf_enable();
- out:
+       }
+out:
+       perf_pmu_enable(ctx->pmu);
         raw_spin_unlock(&ctx->lock);
  }
  
@@ -1209,34 +1181,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
         }
  }
  
-/*
- * Called from scheduler to remove the events of the current task,
- * with interrupts disabled.
- *
- * We stop each event and update the event value in event->count.
- *
- * This does not protect us against NMI, but disable()
- * sets the disabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * not restart the event.
- */
-void perf_event_task_sched_out(struct task_struct *task,
-                                struct task_struct *next)
+void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+                                 struct task_struct *next)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_event_context *ctx = task->perf_event_ctxp;
+       struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
         struct perf_event_context *next_ctx;
         struct perf_event_context *parent;
+       struct perf_cpu_context *cpuctx;
         int do_switch = 1;
  
-       perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+       if (likely(!ctx))
+               return;
  
-       if (likely(!ctx || !cpuctx->task_ctx))
+       cpuctx = __get_cpu_context(ctx);
+       if (!cpuctx->task_ctx)
                 return;
  
         rcu_read_lock();
         parent = rcu_dereference(ctx->parent_ctx);
-       next_ctx = next->perf_event_ctxp;
+       next_ctx = next->perf_event_ctxp[ctxn];
         if (parent && next_ctx &&
             rcu_dereference(next_ctx->parent_ctx) == parent) {
                 /*
@@ -1255,8 +1218,8 @@ void perf_event_task_sched_out(struct task_struct *task,
                          * XXX do we need a memory barrier of sorts
                          * wrt to rcu_dereference() of perf_event_ctxp
                          */
-                       task->perf_event_ctxp = next_ctx;
-                       next->perf_event_ctxp = ctx;
+                       task->perf_event_ctxp[ctxn] = next_ctx;
+                       next->perf_event_ctxp[ctxn] = ctx;
                         ctx->task = next;
                         next_ctx->task = task;
                         do_switch = 0;
@@ -1274,10 +1237,35 @@ void perf_event_task_sched_out(struct task_struct *task,
         }
  }
  
+#define for_each_task_context_nr(ctxn)                                 \
+       for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
+
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void perf_event_task_sched_out(struct task_struct *task,
+                              struct task_struct *next)
+{
+       int ctxn;
+
+       perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+
+       for_each_task_context_nr(ctxn)
+               perf_event_context_sched_out(task, ctxn, next);
+}
+
  static void task_ctx_sched_out(struct perf_event_context *ctx,
                                enum event_type_t event_type)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  
         if (!cpuctx->task_ctx)
                 return;
@@ -1350,9 +1338,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
                 if (event->cpu != -1 && event->cpu != smp_processor_id())
                         continue;
  
-               if (group_can_go_on(event, cpuctx, can_add_hw))
+               if (group_can_go_on(event, cpuctx, can_add_hw)) {
                         if (group_sched_in(event, cpuctx, ctx))
                                 can_add_hw = 0;
+               }
         }
  }
  
@@ -1368,8 +1357,6 @@ ctx_sched_in(struct perf_event_context *ctx,
  
         ctx->timestamp = perf_clock();
  
-       perf_disable();
-
         /*
          * First go through the list and put on any pinned groups
          * in order to give them the best chance of going on.
@@ -1381,8 +1368,7 @@ ctx_sched_in(struct perf_event_context *ctx,
         if (event_type & EVENT_FLEXIBLE)
                 ctx_flexible_sched_in(ctx, cpuctx);
  
-       perf_enable();
- out:
+out:
         raw_spin_unlock(&ctx->lock);
  }
  
@@ -1394,43 +1380,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
         ctx_sched_in(ctx, cpuctx, event_type);
  }
  
-static void task_ctx_sched_in(struct task_struct *task,
+static void task_ctx_sched_in(struct perf_event_context *ctx,
                               enum event_type_t event_type)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_event_context *ctx = task->perf_event_ctxp;
+       struct perf_cpu_context *cpuctx;
  
-       if (likely(!ctx))
-               return;
+               cpuctx = __get_cpu_context(ctx);
         if (cpuctx->task_ctx == ctx)
                 return;
+
         ctx_sched_in(ctx, cpuctx, event_type);
         cpuctx->task_ctx = ctx;
  }
-/*
- * Called from scheduler to add the events of the current task
- * with interrupts disabled.
- *
- * We restore the event value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * keep the event running.
- */
-void perf_event_task_sched_in(struct task_struct *task)
-{
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_event_context *ctx = task->perf_event_ctxp;
  
-       if (likely(!ctx))
-               return;
+void perf_event_context_sched_in(struct perf_event_context *ctx)
+{
+       struct perf_cpu_context *cpuctx;
  
+       cpuctx = __get_cpu_context(ctx);
         if (cpuctx->task_ctx == ctx)
                 return;
  
-       perf_disable();
-
+       perf_pmu_disable(ctx->pmu);
         /*
          * We want to keep the following priority order:
          * cpu pinned (that don't need to move), task pinned,
@@ -1444,7 +1415,37 @@ void perf_event_task_sched_in(struct task_struct *task)
  
         cpuctx->task_ctx = ctx;
  
-       perf_enable();
+       /*
+        * Since these rotations are per-cpu, we need to ensure the
+        * cpu-context we got scheduled on is actually rotating.
+        */
+       perf_pmu_rotate_start(ctx->pmu);
+       perf_pmu_enable(ctx->pmu);
+}
+
+/*
+ * Called from scheduler to add the events of the current task
+ * with interrupts disabled.
+ *
+ * We restore the event value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * keep the event running.
+ */
+void perf_event_task_sched_in(struct task_struct *task)
+{
+       struct perf_event_context *ctx;
+       int ctxn;
+
+       for_each_task_context_nr(ctxn) {
+               ctx = task->perf_event_ctxp[ctxn];
+               if (likely(!ctx))
+                       continue;
+
+               perf_event_context_sched_in(ctx);
+       }
  }
  
  #define MAX_INTERRUPTS (~0ULL)
@@ -1524,22 +1525,6 @@ do {                                     \
         return div64_u64(dividend, divisor);
  }
  
-static void perf_event_stop(struct perf_event *event)
-{
-       if (!event->pmu->stop)
-               return event->pmu->disable(event);
-
-       return event->pmu->stop(event);
-}
-
-static int perf_event_start(struct perf_event *event)
-{
-       if (!event->pmu->start)
-               return event->pmu->enable(event);
-
-       return event->pmu->start(event);
-}
-
  static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
  {
         struct hw_perf_event *hwc = &event->hw;
@@ -1559,15 +1544,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
         hwc->sample_period = sample_period;
  
         if (local64_read(&hwc->period_left) > 8*sample_period) {
-               perf_disable();
-               perf_event_stop(event);
+               event->pmu->stop(event, PERF_EF_UPDATE);
                 local64_set(&hwc->period_left, 0);
-               perf_event_start(event);
-               perf_enable();
+               event->pmu->start(event, PERF_EF_RELOAD);
         }
  }
  
-static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
+static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
  {
         struct perf_event *event;
         struct hw_perf_event *hwc;
@@ -1592,23 +1575,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
                  */
                 if (interrupts == MAX_INTERRUPTS) {
                         perf_log_throttle(event, 1);
-                       perf_disable();
-                       event->pmu->unthrottle(event);
-                       perf_enable();
+                       event->pmu->start(event, 0);
                 }
  
                 if (!event->attr.freq || !event->attr.sample_freq)
                         continue;
  
-               perf_disable();
                 event->pmu->read(event);
                 now = local64_read(&event->count);
                 delta = now - hwc->freq_count_stamp;
                 hwc->freq_count_stamp = now;
  
                 if (delta > 0)
-                       perf_adjust_period(event, TICK_NSEC, delta);
-               perf_enable();
+                       perf_adjust_period(event, period, delta);
         }
         raw_spin_unlock(&ctx->lock);
  }
@@ -1626,32 +1605,38 @@ static void rotate_ctx(struct perf_event_context *ctx)
         raw_spin_unlock(&ctx->lock);
  }
  
-void perf_event_task_tick(struct task_struct *curr)
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
+static void perf_rotate_context(struct perf_cpu_context *cpuctx)
  {
-       struct perf_cpu_context *cpuctx;
-       struct perf_event_context *ctx;
-       int rotate = 0;
-
-       if (!atomic_read(&nr_events))
-               return;
+       u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
+       struct perf_event_context *ctx = NULL;
+       int rotate = 0, remove = 1;
  
-       cpuctx = &__get_cpu_var(perf_cpu_context);
-       if (cpuctx->ctx.nr_events &&
-           cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
-               rotate = 1;
+       if (cpuctx->ctx.nr_events) {
+               remove = 0;
+               if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+                       rotate = 1;
+       }
  
-       ctx = curr->perf_event_ctxp;
-       if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
-               rotate = 1;
+       ctx = cpuctx->task_ctx;
+       if (ctx && ctx->nr_events) {
+               remove = 0;
+               if (ctx->nr_events != ctx->nr_active)
+                       rotate = 1;
+       }
  
-       perf_ctx_adjust_freq(&cpuctx->ctx);
+       perf_pmu_disable(cpuctx->ctx.pmu);
+       perf_ctx_adjust_freq(&cpuctx->ctx, interval);
         if (ctx)
-               perf_ctx_adjust_freq(ctx);
+               perf_ctx_adjust_freq(ctx, interval);
  
         if (!rotate)
-               return;
+               goto done;
  
-       perf_disable();
         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
         if (ctx)
                 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1662,8 +1647,27 @@ void perf_event_task_tick(struct task_struct *curr)
  
         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
         if (ctx)
-               task_ctx_sched_in(curr, EVENT_FLEXIBLE);
-       perf_enable();
+               task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
+
+done:
+       if (remove)
+               list_del_init(&cpuctx->rotation_list);
+
+       perf_pmu_enable(cpuctx->ctx.pmu);
+}
+
+void perf_event_task_tick(void)
+{
+       struct list_head *head = &__get_cpu_var(rotation_list);
+       struct perf_cpu_context *cpuctx, *tmp;
+
+       WARN_ON(!irqs_disabled());
+
+       list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
+               if (cpuctx->jiffies_interval == 1 ||
+                               !(jiffies % cpuctx->jiffies_interval))
+                       perf_rotate_context(cpuctx);
+       }
  }
  
  static int event_enable_on_exec(struct perf_event *event,
@@ -1685,20 +1689,18 @@ static int event_enable_on_exec(struct perf_event *event,
   * Enable all of a task's events that have been marked enable-on-exec.
   * This expects task == current.
   */
-static void perf_event_enable_on_exec(struct task_struct *task)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
  {
-       struct perf_event_context *ctx;
         struct perf_event *event;
         unsigned long flags;
         int enabled = 0;
         int ret;
  
         local_irq_save(flags);
-       ctx = task->perf_event_ctxp;
         if (!ctx || !ctx->nr_events)
                 goto out;
  
-       __perf_event_task_sched_out(ctx);
+       task_ctx_sched_out(ctx, EVENT_ALL);
  
         raw_spin_lock(&ctx->lock);
  
@@ -1722,8 +1724,8 @@ static void perf_event_enable_on_exec(struct task_struct *task)
  
         raw_spin_unlock(&ctx->lock);
  
-       perf_event_task_sched_in(task);
- out:
+       perf_event_context_sched_in(ctx);
+out:
         local_irq_restore(flags);
  }
  
@@ -1732,9 +1734,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
   */
  static void __perf_event_read(void *info)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
         struct perf_event *event = info;
         struct perf_event_context *ctx = event->ctx;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  
         /*
          * If this is a task context, we need to check whether it is
@@ -1782,112 +1784,357 @@ static u64 perf_event_read(struct perf_event *event)
  }
  
  /*
- * Initialize the perf_event context in a task_struct:
+ * Callchain support
   */
-static void
-__perf_event_init_context(struct perf_event_context *ctx,
-                           struct task_struct *task)
+
+/*
+ * Container for the callchain buffers of all CPUs.
+ *
+ * cpu_entries[] is a trailing array indexed by CPU number; each slot points
+ * to that CPU's array of perf_callchain_entry. rcu_head is what gets handed
+ * to call_rcu() when the container is released.
+ */
+struct callchain_cpus_entries {
+       struct rcu_head                 rcu_head;
+       struct perf_callchain_entry     *cpu_entries[0];
+};
+
+/* Per-cpu recursion counters for callchain capture, one per context level */
+static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+/* Number of references currently held on the callchain buffers */
+static atomic_t nr_callchain_events;
+/* Held while the callchain buffers are allocated or released */
+static DEFINE_MUTEX(callchain_mutex);
+/* Buffer container, published with rcu_assign_pointer(); NULL when absent */
+struct callchain_cpus_entries *callchain_cpus_entries;
+
+
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+                                 struct pt_regs *regs)
  {
-       raw_spin_lock_init(&ctx->lock);
-       mutex_init(&ctx->mutex);
-       INIT_LIST_HEAD(&ctx->pinned_groups);
-       INIT_LIST_HEAD(&ctx->flexible_groups);
-       INIT_LIST_HEAD(&ctx->event_list);
-       atomic_set(&ctx->refcount, 1);
-       ctx->task = task;
  }
  
-static struct perf_event_context *find_get_context(pid_t pid, int cpu)
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+                               struct pt_regs *regs)
  {
-       struct perf_event_context *ctx;
-       struct perf_cpu_context *cpuctx;
-       struct task_struct *task;
-       unsigned long flags;
-       int err;
+}
  
-       if (pid == -1 && cpu != -1) {
-               /* Must be root to operate on a CPU event: */
-               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-                       return ERR_PTR(-EACCES);
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+       struct callchain_cpus_entries *entries;
+       int cpu;
  
-               if (cpu < 0 || cpu >= nr_cpumask_bits)
-                       return ERR_PTR(-EINVAL);
+       entries = container_of(head, struct callchain_cpus_entries, rcu_head);
  
-               /*
-                * We could be clever and allow to attach a event to an
-                * offline CPU and activate it when the CPU comes up, but
-                * that's for later.
-                */
-               if (!cpu_online(cpu))
-                       return ERR_PTR(-ENODEV);
+       for_each_possible_cpu(cpu)
+               kfree(entries->cpu_entries[cpu]);
  
-               cpuctx = &per_cpu(perf_cpu_context, cpu);
-               ctx = &cpuctx->ctx;
-               get_ctx(ctx);
+       kfree(entries);
+}
  
-               return ctx;
-       }
+static void release_callchain_buffers(void)
+{
+       struct callchain_cpus_entries *entries;
  
-       rcu_read_lock();
-       if (!pid)
-               task = current;
-       else
-               task = find_task_by_vpid(pid);
-       if (task)
-               get_task_struct(task);
-       rcu_read_unlock();
+       entries = callchain_cpus_entries;
+       rcu_assign_pointer(callchain_cpus_entries, NULL);
+       call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
  
-       if (!task)
-               return ERR_PTR(-ESRCH);
+static int alloc_callchain_buffers(void)
+{
+       int cpu;
+       int size;
+       struct callchain_cpus_entries *entries;
  
         /*
-        * Can't attach events to a dying task.
+        * We can't use the percpu allocation API for data that can be
+        * accessed from NMI. Use a temporary manual per cpu allocation
+        * until that gets sorted out.
          */
-       err = -ESRCH;
-       if (task->flags & PF_EXITING)
-               goto errout;
+       size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
+               num_possible_cpus();
  
-       /* Reuse ptrace permission checks for now. */
-       err = -EACCES;
-       if (!ptrace_may_access(task, PTRACE_MODE_READ))
-               goto errout;
+       entries = kzalloc(size, GFP_KERNEL);
+       if (!entries)
+               return -ENOMEM;
  
- retry:
-       ctx = perf_lock_task_context(task, &flags);
-       if (ctx) {
-               unclone_ctx(ctx);
-               raw_spin_unlock_irqrestore(&ctx->lock, flags);
-       }
+       size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
  
-       if (!ctx) {
-               ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
-               err = -ENOMEM;
-               if (!ctx)
-                       goto errout;
-               __perf_event_init_context(ctx, task);
-               get_ctx(ctx);
-               if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
-                       /*
-                        * We raced with some other task; use
-                        * the context they set.
-                        */
-                       kfree(ctx);
-                       goto retry;
-               }
-               get_task_struct(task);
+       for_each_possible_cpu(cpu) {
+               entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+                                                        cpu_to_node(cpu));
+               if (!entries->cpu_entries[cpu])
+                       goto fail;
         }
  
-       put_task_struct(task);
-       return ctx;
+       rcu_assign_pointer(callchain_cpus_entries, entries);
  
- errout:
-       put_task_struct(task);
-       return ERR_PTR(err);
-}
+       return 0;
  
-static void perf_event_free_filter(struct perf_event *event);
+fail:
+       for_each_possible_cpu(cpu)
+               kfree(entries->cpu_entries[cpu]);
+       kfree(entries);
  
-static void free_event_rcu(struct rcu_head *head)
+       return -ENOMEM;
+}
+
+/*
+ * Take a reference on the shared callchain buffers, allocating them for
+ * the first user.
+ *
+ * Returns 0 on success, -EINVAL if the user count is below 1 right after
+ * the increment, or -ENOMEM if the buffers are missing because this or an
+ * earlier allocation failed.
+ *
+ * nr_callchain_events stays incremented on the failure paths too.
+ * NOTE(review): presumably the caller drops it again through
+ * put_callchain_buffers() (free_event() does so for events sampling
+ * PERF_SAMPLE_CALLCHAIN) - confirm.
+ */
+static int get_callchain_buffers(void)
+{
+       int err = 0;
+       int count;
+
+       mutex_lock(&callchain_mutex);
+
+       count = atomic_inc_return(&nr_callchain_events);
+       if (WARN_ON_ONCE(count < 1)) {
+               err = -EINVAL;
+               goto exit;
+       }
+
+       if (count > 1) {
+               /* If the allocation failed, give up */
+               if (!callchain_cpus_entries)
+                       err = -ENOMEM;
+               goto exit;
+       }
+
+       /*
+        * On failure alloc_callchain_buffers() has already freed what it
+        * allocated and has published nothing, so there is nothing to
+        * release: release_callchain_buffers() would pass a pointer
+        * derived from the NULL container to call_rcu().
+        */
+       err = alloc_callchain_buffers();
+exit:
+       mutex_unlock(&callchain_mutex);
+
+       return err;
+}
+
+/*
+ * Drop a reference taken by get_callchain_buffers(). The buffers are
+ * released, under callchain_mutex, when the last reference goes away.
+ */
+static void put_callchain_buffers(void)
+{
+       if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
+               /*
+                * No container is published if the allocation failed in
+                * get_callchain_buffers(); release_callchain_buffers()
+                * dereferences it unconditionally.
+                */
+               if (callchain_cpus_entries)
+                       release_callchain_buffers();
+               mutex_unlock(&callchain_mutex);
+       }
+}
+
+/*
+ * Claim the recursion counter matching the current execution context.
+ *
+ * @recursion: array of counters indexed by context level
+ *
+ * The level is 3 when in_nmi(), 2 when in_irq(), 1 when in_softirq() and
+ * 0 otherwise. Returns that level after incrementing its counter, or -1
+ * without touching anything if the counter is already non-zero.
+ */
+static int get_recursion_context(int *recursion)
+{
+       int rctx = 0;
+       int *slot;
+
+       if (in_nmi())
+               rctx = 3;
+       else if (in_irq())
+               rctx = 2;
+       else if (in_softirq())
+               rctx = 1;
+
+       slot = &recursion[rctx];
+       if (*slot)
+               return -1;
+
+       (*slot)++;
+       barrier();
+
+       return rctx;
+}
+
+/*
+ * Give back the counter at level @rctx of @recursion. A compiler barrier
+ * is issued before the counter is decremented.
+ */
+static inline void put_recursion_context(int *recursion, int rctx)
+{
+       int *slot = &recursion[rctx];
+
+       barrier();
+       (*slot)--;
+}
+
+/*
+ * Pick the callchain entry of the current CPU for the current context
+ * level.
+ *
+ * @rctx: set to the claimed recursion level, or to -1 if it was busy
+ *
+ * Returns NULL if the recursion level is busy or if no buffers are
+ * published. In the latter case the recursion level in *rctx is still
+ * held when this returns.
+ */
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+       struct callchain_cpus_entries *bufs;
+
+       *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+       if (*rctx == -1)
+               return NULL;
+
+       bufs = rcu_dereference(callchain_cpus_entries);
+       if (!bufs)
+               return NULL;
+
+       return &bufs->cpu_entries[smp_processor_id()][*rctx];
+}
+
+/* Release level @rctx of this CPU's callchain recursion counters. */
+static void put_callchain_entry(int rctx)
+{
+       int *recursion = __get_cpu_var(callchain_recursion);
+
+       put_recursion_context(recursion, rctx);
+}
+
+/*
+ * Record the callchain for @regs into this CPU's entry for the current
+ * context level.
+ *
+ * For regs that are not user mode, a PERF_CONTEXT_KERNEL marker and the
+ * kernel chain are stored first; the user part then uses
+ * task_pt_regs(current), and is skipped when current has no mm. The user
+ * part is a PERF_CONTEXT_USER marker followed by the user chain.
+ *
+ * Returns the entry, or NULL if the recursion level is busy or no entry
+ * could be obtained. The recursion level is released before returning.
+ */
+static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+       struct perf_callchain_entry *entry;
+       int rctx;
+
+       entry = get_callchain_entry(&rctx);
+       if (rctx == -1)
+               return NULL;
+
+       if (entry) {
+               entry->nr = 0;
+
+               if (!user_mode(regs)) {
+                       perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+                       perf_callchain_kernel(entry, regs);
+                       regs = current->mm ? task_pt_regs(current) : NULL;
+               }
+
+               if (regs) {
+                       perf_callchain_store(entry, PERF_CONTEXT_USER);
+                       perf_callchain_user(entry, regs);
+               }
+       }
+
+       put_callchain_entry(rctx);
+
+       return entry;
+}
+
+/*
+ * Initialize the lock, mutex, group/event lists and reference count of a
+ * perf_event_context. The reference count starts at 1.
+ */
+static void __perf_event_init_context(struct perf_event_context *ctx)
+{
+       atomic_set(&ctx->refcount, 1);
+
+       INIT_LIST_HEAD(&ctx->event_list);
+       INIT_LIST_HEAD(&ctx->flexible_groups);
+       INIT_LIST_HEAD(&ctx->pinned_groups);
+
+       mutex_init(&ctx->mutex);
+       raw_spin_lock_init(&ctx->lock);
+}
+
+/*
+ * Allocate a zeroed perf_event_context, initialize it and bind it to @pmu.
+ *
+ * If @task is non-NULL the context records it and takes a reference on
+ * the task. Returns the new context, or NULL if the allocation fails.
+ */
+static struct perf_event_context *
+alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+{
+       struct perf_event_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+
+       if (!ctx)
+               return NULL;
+
+       __perf_event_init_context(ctx);
+       ctx->pmu = pmu;
+
+       if (task) {
+               get_task_struct(task);
+               ctx->task = task;
+       }
+
+       return ctx;
+}
+
+/*
+ * Look up the task for @vpid (0 selects current) and take a reference on
+ * it.
+ *
+ * Returns the task, ERR_PTR(-ESRCH) if no task is found or the task has
+ * PF_EXITING set, or ERR_PTR(-EACCES) if ptrace_may_access() refuses
+ * PTRACE_MODE_READ. The reference is dropped again on the error paths.
+ */
+static struct task_struct *
+find_lively_task_by_vpid(pid_t vpid)
+{
+       struct task_struct *task;
+       int err;
+
+       rcu_read_lock();
+       task = vpid ? find_task_by_vpid(vpid) : current;
+       if (task)
+               get_task_struct(task);
+       rcu_read_unlock();
+
+       if (!task)
+               return ERR_PTR(-ESRCH);
+
+       /* Can't attach events to a dying task. */
+       if (task->flags & PF_EXITING) {
+               err = -ESRCH;
+               goto errout;
+       }
+
+       /* Reuse ptrace permission checks for now. */
+       if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
+               err = -EACCES;
+               goto errout;
+       }
+
+       return task;
+
+errout:
+       put_task_struct(task);
+       return ERR_PTR(err);
+}
+
+/*
+ * Return the perf_event_context that events of @pmu attach to, with a
+ * reference taken through get_ctx().
+ *
+ * With no @task and @cpu != -1 this is the context embedded in @pmu's
+ * per-cpu context for @cpu. That path requires CAP_SYS_ADMIN when
+ * perf_paranoid_cpu() is set, a @cpu below nr_cpumask_bits and an online
+ * CPU. Otherwise it is the context in slot pmu->task_ctx_nr of
+ * @task->perf_event_ctxp, which is allocated and installed here if the
+ * slot is still empty.
+ *
+ * Returns the context, or ERR_PTR() with -EACCES, -EINVAL, -ENODEV or
+ * -ENOMEM.
+ *
+ * NOTE(review): every exit of the task path calls put_task_struct(@task),
+ * which looks like it consumes a task reference owned by the caller -
+ * confirm against the callers.
+ */
+static struct perf_event_context *
+find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
+{
+       struct perf_event_context *ctx;
+       struct perf_cpu_context *cpuctx;
+       unsigned long flags;
+       int ctxn, err;
+
+       if (!task && cpu != -1) {
+               /* Must be root to operate on a CPU event: */
+               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+                       return ERR_PTR(-EACCES);
+
+               if (cpu < 0 || cpu >= nr_cpumask_bits)
+                       return ERR_PTR(-EINVAL);
+
+               /*
+                * We could be clever and allow attaching an event to an
+                * offline CPU and activate it when the CPU comes up, but
+                * that's for later.
+                */
+               if (!cpu_online(cpu))
+                       return ERR_PTR(-ENODEV);
+
+               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+               ctx = &cpuctx->ctx;
+               get_ctx(ctx);
+
+               return ctx;
+       }
+
+       /* A negative task_ctx_nr means this pmu has no task context slot */
+       err = -EINVAL;
+       ctxn = pmu->task_ctx_nr;
+       if (ctxn < 0)
+               goto errout;
+
+retry:
+       /* A context found here comes back with ctx->lock held */
+       ctx = perf_lock_task_context(task, ctxn, &flags);
+       if (ctx) {
+               unclone_ctx(ctx);
+               raw_spin_unlock_irqrestore(&ctx->lock, flags);
+       }
+
+       if (!ctx) {
+               ctx = alloc_perf_context(pmu, task);
+               err = -ENOMEM;
+               if (!ctx)
+                       goto errout;
+
+               /*
+                * Second reference on top of the initial refcount of 1.
+                * NOTE(review): presumably one is for the task's slot and
+                * one for the caller - confirm.
+                */
+               get_ctx(ctx);
+
+               if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
+                       /*
+                        * We raced with some other task; use
+                        * the context they set.
+                        */
+                       /* Undo the task reference from alloc_perf_context() */
+                       put_task_struct(task);
+                       kfree(ctx);
+                       goto retry;
+               }
+       }
+
+       put_task_struct(task);
+       return ctx;
+
+errout:
+       put_task_struct(task);
+       return ERR_PTR(err);
+}
+
+static void perf_event_free_filter(struct perf_event *event);
+
+static void free_event_rcu(struct rcu_head *head)
  {
         struct perf_event *event;
  
@@ -1913,6 +2160,8 @@ static void free_event(struct perf_event *event)
                         atomic_dec(&nr_comm_events);
                 if (event->attr.task)
                         atomic_dec(&nr_task_events);
+               if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+                       put_callchain_buffers();
         }
  
         if (event->buffer) {
@@ -1923,7 +2172,9 @@ static void free_event(struct perf_event *event)
         if (event->destroy)
                 event->destroy(event);
  
-       put_ctx(event->ctx);
+       if (event->ctx)
+               put_ctx(event->ctx);
+
         call_rcu(&event->rcu_head, free_event_rcu);
  }
  
@@ -2344,6 +2595,9 @@ int perf_event_task_disable(void)
  
  static int perf_event_index(struct perf_event *event)
  {
+       if (event->hw.state & PERF_HES_STOPPED)
+               return 0;
+
         if (event->state != PERF_EVENT_STATE_ACTIVE)
                 return 0;
  
@@ -2955,16 +3209,6 @@ void perf_event_do_pending(void)
         __perf_pending_run();
  }
  
-/*
- * Callchain support -- arch specific
- */
-
-__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-       return NULL;
-}
-
-
  /*
   * We assume there is only KVM supporting the callbacks.
   * Later on, we might change it to a list if there is
@@ -3071,7 +3315,7 @@ again:
         if (handle->wakeup != local_read(&buffer->wakeup))
                 perf_output_wakeup(handle);
  
- out:
+out:
         preempt_enable();
  }
  
@@ -3459,14 +3703,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
         struct perf_output_handle handle;
         struct perf_event_header header;
  
+       /* protect the callchain buffers */
+       rcu_read_lock();
+
         perf_prepare_sample(&header, data, event, regs);
  
         if (perf_output_begin(&handle, event, header.size, nmi, 1))
-               return;
+               goto exit;
  
         perf_output_sample(&handle, &header, data, event);
  
         perf_output_end(&handle);
+
+exit:
+       rcu_read_unlock();
  }
  
  /*
@@ -3580,16 +3830,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
  static void perf_event_task_event(struct perf_task_event *task_event)
  {
         struct perf_cpu_context *cpuctx;
-       struct perf_event_context *ctx = task_event->task_ctx;
+       struct perf_event_context *ctx;
+       struct pmu *pmu;
+       int ctxn;
  
         rcu_read_lock();
-       cpuctx = &get_cpu_var(perf_cpu_context);
-       perf_event_task_ctx(&cpuctx->ctx, task_event);
-       if (!ctx)
-               ctx = rcu_dereference(current->perf_event_ctxp);
-       if (ctx)
-               perf_event_task_ctx(ctx, task_event);
-       put_cpu_var(perf_cpu_context);
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+               perf_event_task_ctx(&cpuctx->ctx, task_event);
+
+               ctx = task_event->task_ctx;
+               if (!ctx) {
+                       ctxn = pmu->task_ctx_nr;
+                       if (ctxn < 0)
+                               goto next;
+                       ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+               }
+               if (ctx)
+                       perf_event_task_ctx(ctx, task_event);
+next:
+               put_cpu_ptr(pmu->pmu_cpu_context);
+       }
         rcu_read_unlock();
  }
  
@@ -3694,8 +3955,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
  {
         struct perf_cpu_context *cpuctx;
         struct perf_event_context *ctx;
-       unsigned int size;
         char comm[TASK_COMM_LEN];
+       unsigned int size;
+       struct pmu *pmu;
+       int ctxn;
  
         memset(comm, 0, sizeof(comm));
         strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3707,21 +3970,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
  
         rcu_read_lock();
-       cpuctx = &get_cpu_var(perf_cpu_context);
-       perf_event_comm_ctx(&cpuctx->ctx, comm_event);
-       ctx = rcu_dereference(current->perf_event_ctxp);
-       if (ctx)
-               perf_event_comm_ctx(ctx, comm_event);
-       put_cpu_var(perf_cpu_context);
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+               perf_event_comm_ctx(&cpuctx->ctx, comm_event);
+
+               ctxn = pmu->task_ctx_nr;
+               if (ctxn < 0)
+                       goto next;
+
+               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+               if (ctx)
+                       perf_event_comm_ctx(ctx, comm_event);
+next:
+               put_cpu_ptr(pmu->pmu_cpu_context);
+       }
         rcu_read_unlock();
  }
  
  void perf_event_comm(struct task_struct *task)
  {
         struct perf_comm_event comm_event;
+       struct perf_event_context *ctx;
+       int ctxn;
+
+       for_each_task_context_nr(ctxn) {
+               ctx = task->perf_event_ctxp[ctxn];
+               if (!ctx)
+                       continue;
  
-       if (task->perf_event_ctxp)
-               perf_event_enable_on_exec(task);
+               perf_event_enable_on_exec(ctx);
+       }
  
         if (!atomic_read(&nr_comm_events))
                 return;
@@ -3823,6 +4101,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
         char tmp[16];
         char *buf = NULL;
         const char *name;
+       struct pmu *pmu;
+       int ctxn;
  
         memset(tmp, 0, sizeof(tmp));
  
@@ -3875,12 +4155,23 @@ got_name:
         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
  
         rcu_read_lock();
-       cpuctx = &get_cpu_var(perf_cpu_context);
-       perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
-       ctx = rcu_dereference(current->perf_event_ctxp);
-       if (ctx)
-               perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
-       put_cpu_var(perf_cpu_context);
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+               perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
+                                       vma->vm_flags & VM_EXEC);
+
+               ctxn = pmu->task_ctx_nr;
+               if (ctxn < 0)
+                       goto next;
+
+               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+               if (ctx) {
+                       perf_event_mmap_ctx(ctx, mmap_event,
+                                       vma->vm_flags & VM_EXEC);
+               }
+next:
+               put_cpu_ptr(pmu->pmu_cpu_context);
+       }
         rcu_read_unlock();
  
         kfree(buf);
@@ -3962,8 +4253,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
         struct hw_perf_event *hwc = &event->hw;
         int ret = 0;
  
-       throttle = (throttle && event->pmu->unthrottle != NULL);
-
         if (!throttle) {
                 hwc->interrupts++;
         } else {
@@ -4031,6 +4320,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
   * Generic software event infrastructure
   */
  
+/*
+ * Per-cpu state for software events: the hash list of events (read with
+ * rcu_dereference()), the mutex and reference count used when that list
+ * is set up or released, and the recursion counters.
+ */
+struct swevent_htable {
+       struct swevent_hlist            *swevent_hlist;
+       struct mutex                    hlist_mutex;
+       int                             hlist_refcount;
+
+       /* Recursion avoidance in each context */
+       int                             recursion[PERF_NR_CONTEXTS];
+};
+
+/* One software event hash table per CPU */
+static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
+
  /*
   * We directly increment event->count and keep a second value in
   * event->hw.period_left to count intervals. This period event
@@ -4088,7 +4388,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
         }
  }
  
-static void perf_swevent_add(struct perf_event *event, u64 nr,
+static void perf_swevent_event(struct perf_event *event, u64 nr,
                                int nmi, struct perf_sample_data *data,
                                struct pt_regs *regs)
  {
@@ -4114,6 +4414,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
  static int perf_exclude_event(struct perf_event *event,
                               struct pt_regs *regs)
  {
+       if (event->hw.state & PERF_HES_STOPPED)
+               return 0;
+
         if (regs) {
                 if (event->attr.exclude_user && user_mode(regs))
                         return 1;
@@ -4160,11 +4463,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
  
  /* For the read side: events when they trigger */
  static inline struct hlist_head *
-find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
  {
         struct swevent_hlist *hlist;
  
-       hlist = rcu_dereference(ctx->swevent_hlist);
+       hlist = rcu_dereference(swhash->swevent_hlist);
         if (!hlist)
                 return NULL;
  
@@ -4173,7 +4476,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
  
  /* For the event head insertion and removal in the hlist */
  static inline struct hlist_head *
-find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
  {
         struct swevent_hlist *hlist;
         u32 event_id = event->attr.config;
@@ -4184,7 +4487,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
          * and release. Which makes the protected version suitable here.
          * The context lock guarantees that.
          */
-       hlist = rcu_dereference_protected(ctx->swevent_hlist,
+       hlist = rcu_dereference_protected(swhash->swevent_hlist,
                                           lockdep_is_held(&event->ctx->lock));
         if (!hlist)
                 return NULL;
@@ -4197,23 +4500,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
                                     struct perf_sample_data *data,
                                     struct pt_regs *regs)
  {
-       struct perf_cpu_context *cpuctx;
+       struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
         struct perf_event *event;
         struct hlist_node *node;
         struct hlist_head *head;
  
-       cpuctx = &__get_cpu_var(perf_cpu_context);
-
         rcu_read_lock();
-
-       head = find_swevent_head_rcu(cpuctx, type, event_id);
-
+       head = find_swevent_head_rcu(swhash, type, event_id);
         if (!head)
                 goto end;
  
         hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
                 if (perf_swevent_match(event, type, event_id, data, regs))
-                       perf_swevent_add(event, nr, nmi, data, regs);
+                       perf_swevent_event(event, nr, nmi, data, regs);
         }
  end:
         rcu_read_unlock();
@@ -4221,33 +4520,17 @@ end:
  
  int perf_swevent_get_recursion_context(void)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       int rctx;
+       struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
  
-       if (in_nmi())
-               rctx = 3;
-       else if (in_irq())
-               rctx = 2;
-       else if (in_softirq())
-               rctx = 1;
-       else
-               rctx = 0;
-
-       if (cpuctx->recursion[rctx])
-               return -1;
-
-       cpuctx->recursion[rctx]++;
-       barrier();
-
-       return rctx;
+       return get_recursion_context(swhash->recursion);
  }
  EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
  
  void inline perf_swevent_put_recursion_context(int rctx)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       barrier();
-       cpuctx->recursion[rctx]--;
+       struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+
+       put_recursion_context(swhash->recursion, rctx);
  }
  
  void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4273,20 +4556,20 @@ static void perf_swevent_read(struct perf_event *event)
  {
  }
  
-static int perf_swevent_enable(struct perf_event *event)
+static int perf_swevent_add(struct perf_event *event, int flags)
  {
+       struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
         struct hw_perf_event *hwc = &event->hw;
-       struct perf_cpu_context *cpuctx;
         struct hlist_head *head;
  
-       cpuctx = &__get_cpu_var(perf_cpu_context);
-
         if (hwc->sample_period) {
                 hwc->last_period = hwc->sample_period;
                 perf_swevent_set_period(event);
         }
  
-       head = find_swevent_head(cpuctx, event);
+       hwc->state = !(flags & PERF_EF_START);
+
+       head = find_swevent_head(swhash, event);
         if (WARN_ON_ONCE(!head))
                 return -EINVAL;
  
@@ -4295,202 +4578,27 @@ static int perf_swevent_enable(struct perf_event *event)
         return 0;
  }
  
-static void perf_swevent_disable(struct perf_event *event)
+static void perf_swevent_del(struct perf_event *event, int flags)
  {
         hlist_del_rcu(&event->hlist_entry);
  }
  
-static void perf_swevent_void(struct perf_event *event)
-{
-}
-
-static int perf_swevent_int(struct perf_event *event)
-{
-       return 0;
-}
-
-static const struct pmu perf_ops_generic = {
-       .enable         = perf_swevent_enable,
-       .disable        = perf_swevent_disable,
-       .start          = perf_swevent_int,
-       .stop           = perf_swevent_void,
-       .read           = perf_swevent_read,
-       .unthrottle     = perf_swevent_void, /* hwc->interrupts already reset */
-};
-
-/*
- * hrtimer based swevent callback
- */
-
-static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
-{
-       enum hrtimer_restart ret = HRTIMER_RESTART;
-       struct perf_sample_data data;
-       struct pt_regs *regs;
-       struct perf_event *event;
-       u64 period;
-
-       event = container_of(hrtimer, struct perf_event, hw.hrtimer);
-       event->pmu->read(event);
-
-       perf_sample_data_init(&data, 0);
-       data.period = event->hw.last_period;
-       regs = get_irq_regs();
-
-       if (regs && !perf_exclude_event(event, regs)) {
-               if (!(event->attr.exclude_idle && current->pid == 0))
-                       if (perf_event_overflow(event, 0, &data, regs))
-                               ret = HRTIMER_NORESTART;
-       }
-
-       period = max_t(u64, 10000, event->hw.sample_period);
-       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
-
-       return ret;
-}
-
-static void perf_swevent_start_hrtimer(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hwc->hrtimer.function = perf_swevent_hrtimer;
-       if (hwc->sample_period) {
-               u64 period;
-
-               if (hwc->remaining) {
-                       if (hwc->remaining < 0)
-                               period = 10000;
-                       else
-                               period = hwc->remaining;
-                       hwc->remaining = 0;
-               } else {
-                       period = max_t(u64, 10000, hwc->sample_period);
-               }
-               __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(period), 0,
-                               HRTIMER_MODE_REL, 0);
-       }
-}
-
-static void perf_swevent_cancel_hrtimer(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (hwc->sample_period) {
-               ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
-               hwc->remaining = ktime_to_ns(remaining);
-
-               hrtimer_cancel(&hwc->hrtimer);
-       }
-}
-
-/*
- * Software event: cpu wall time clock
- */
-
-static void cpu_clock_perf_event_update(struct perf_event *event)
+static void perf_swevent_start(struct perf_event *event, int flags)
  {
-       int cpu = raw_smp_processor_id();
-       s64 prev;
-       u64 now;
-
-       now = cpu_clock(cpu);
-       prev = local64_xchg(&event->hw.prev_count, now);
-       local64_add(now - prev, &event->count);
+       event->hw.state = 0;
  }
  
-static int cpu_clock_perf_event_enable(struct perf_event *event)
+static void perf_swevent_stop(struct perf_event *event, int flags)
  {
-       struct hw_perf_event *hwc = &event->hw;
-       int cpu = raw_smp_processor_id();
-
-       local64_set(&hwc->prev_count, cpu_clock(cpu));
-       perf_swevent_start_hrtimer(event);
-
-       return 0;
+       event->hw.state = PERF_HES_STOPPED;
  }
  
-static void cpu_clock_perf_event_disable(struct perf_event *event)
-{
-       perf_swevent_cancel_hrtimer(event);
-       cpu_clock_perf_event_update(event);
-}
-
-static void cpu_clock_perf_event_read(struct perf_event *event)
-{
-       cpu_clock_perf_event_update(event);
-}
-
-static const struct pmu perf_ops_cpu_clock = {
-       .enable         = cpu_clock_perf_event_enable,
-       .disable        = cpu_clock_perf_event_disable,
-       .read           = cpu_clock_perf_event_read,
-};
-
-/*
- * Software event: task time clock
- */
-
-static void task_clock_perf_event_update(struct perf_event *event, u64 now)
-{
-       u64 prev;
-       s64 delta;
-
-       prev = local64_xchg(&event->hw.prev_count, now);
-       delta = now - prev;
-       local64_add(delta, &event->count);
-}
-
-static int task_clock_perf_event_enable(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 now;
-
-       now = event->ctx->time;
-
-       local64_set(&hwc->prev_count, now);
-
-       perf_swevent_start_hrtimer(event);
-
-       return 0;
-}
-
-static void task_clock_perf_event_disable(struct perf_event *event)
-{
-       perf_swevent_cancel_hrtimer(event);
-       task_clock_perf_event_update(event, event->ctx->time);
-
-}
-
-static void task_clock_perf_event_read(struct perf_event *event)
-{
-       u64 time;
-
-       if (!in_nmi()) {
-               update_context_time(event->ctx);
-               time = event->ctx->time;
-       } else {
-               u64 now = perf_clock();
-               u64 delta = now - event->ctx->timestamp;
-               time = event->ctx->time + delta;
-       }
-
-       task_clock_perf_event_update(event, time);
-}
-
-static const struct pmu perf_ops_task_clock = {
-       .enable         = task_clock_perf_event_enable,
-       .disable        = task_clock_perf_event_disable,
-       .read           = task_clock_perf_event_read,
-};
-
  /* Deref the hlist from the update side */
  static inline struct swevent_hlist *
-swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+swevent_hlist_deref(struct swevent_htable *swhash)
  {
-       return rcu_dereference_protected(cpuctx->swevent_hlist,
-                                        lockdep_is_held(&cpuctx->hlist_mutex));
+       return rcu_dereference_protected(swhash->swevent_hlist,
+                                        lockdep_is_held(&swhash->hlist_mutex));
  }
  
  static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
@@ -4501,27 +4609,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
         kfree(hlist);
  }
  
-static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
+static void swevent_hlist_release(struct swevent_htable *swhash)
  {
-       struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
+       struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
  
         if (!hlist)
                 return;
  
-       rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
+       rcu_assign_pointer(swhash->swevent_hlist, NULL);
         call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
  }
  
  static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
  {
-       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  
-       mutex_lock(&cpuctx->hlist_mutex);
+       mutex_lock(&swhash->hlist_mutex);
  
-       if (!--cpuctx->hlist_refcount)
-               swevent_hlist_release(cpuctx);
+       if (!--swhash->hlist_refcount)
+               swevent_hlist_release(swhash);
  
-       mutex_unlock(&cpuctx->hlist_mutex);
+       mutex_unlock(&swhash->hlist_mutex);
  }
  
  static void swevent_hlist_put(struct perf_event *event)
@@ -4539,12 +4647,12 @@ static void swevent_hlist_put(struct perf_event *event)
  
  static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
  {
-       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
         int err = 0;
  
-       mutex_lock(&cpuctx->hlist_mutex);
+       mutex_lock(&swhash->hlist_mutex);
  
-       if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
+       if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
                 struct swevent_hlist *hlist;
  
                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4552,11 +4660,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
                         err = -ENOMEM;
                         goto exit;
                 }
-               rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+               rcu_assign_pointer(swhash->swevent_hlist, hlist);
         }
-       cpuctx->hlist_refcount++;
- exit:
-       mutex_unlock(&cpuctx->hlist_mutex);
+       swhash->hlist_refcount++;
+exit:
+       mutex_unlock(&swhash->hlist_mutex);
  
         return err;
  }
@@ -4580,7 +4688,7 @@ static int swevent_hlist_get(struct perf_event *event)
         put_online_cpus();
  
         return 0;
- fail:
+fail:
         for_each_possible_cpu(cpu) {
                 if (cpu == failed_cpu)
                         break;
@@ -4591,17 +4699,64 @@ static int swevent_hlist_get(struct perf_event *event)
         return err;
  }
  
-#ifdef CONFIG_EVENT_TRACING
+atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+       u64 event_id = event->attr.config;
  
-static const struct pmu perf_ops_tracepoint = {
-       .enable         = perf_trace_enable,
-       .disable        = perf_trace_disable,
-       .start          = perf_swevent_int,
-       .stop           = perf_swevent_void,
+       WARN_ON(event->parent);
+
+       atomic_dec(&perf_swevent_enabled[event_id]);
+       swevent_hlist_put(event);
+}
+
+/*
+ * event_init callback of the generic software-event PMU.
+ *
+ * Returns -ENOENT for events this PMU does not claim: non-software types,
+ * the cpu-clock and task-clock configs (which cpu_clock_event_init() and
+ * task_clock_event_init() accept instead) and out-of-range configs.
+ * For events without a parent it takes a swevent hlist reference and bumps
+ * the per-id enable count; sw_perf_event_destroy() undoes both.
+ */
+static int perf_swevent_init(struct perf_event *event)
+{
+       /*
+        * Keep the config at full unsigned width, as sw_perf_event_destroy()
+        * does.  Narrowing it to a signed int could turn a large config into
+        * a negative value that passes the range check below and is then
+        * used as an array index.
+        */
+       u64 event_id = event->attr.config;
+
+       if (event->attr.type != PERF_TYPE_SOFTWARE)
+               return -ENOENT;
+
+       switch (event_id) {
+       case PERF_COUNT_SW_CPU_CLOCK:
+       case PERF_COUNT_SW_TASK_CLOCK:
+               return -ENOENT;
+
+       default:
+               break;
+       }
+
+       /*
+        * perf_swevent_enabled[] has PERF_COUNT_SW_MAX entries, so
+        * PERF_COUNT_SW_MAX itself is already out of bounds.
+        */
+       if (event_id >= PERF_COUNT_SW_MAX)
+               return -ENOENT;
+
+       if (!event->parent) {
+               int err;
+
+               err = swevent_hlist_get(event);
+               if (err)
+                       return err;
+
+               atomic_inc(&perf_swevent_enabled[event_id]);
+               event->destroy = sw_perf_event_destroy;
+       }
+
+       return 0;
+}
+
+static struct pmu perf_swevent = {
+       .task_ctx_nr    = perf_sw_context,
+
+       .event_init     = perf_swevent_init,
+       .add            = perf_swevent_add,
+       .del            = perf_swevent_del,
+       .start          = perf_swevent_start,
+       .stop           = perf_swevent_stop,
         .read           = perf_swevent_read,
-       .unthrottle     = perf_swevent_void,
  };
  
+#ifdef CONFIG_EVENT_TRACING
+
  static int perf_tp_filter_match(struct perf_event *event,
                                 struct perf_sample_data *data)
  {
@@ -4645,7 +4800,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
  
         hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
                 if (perf_tp_event_match(event, &data, regs))
-                       perf_swevent_add(event, count, 1, &data, regs);
+                       perf_swevent_event(event, count, 1, &data, regs);
         }
  
         perf_swevent_put_recursion_context(rctx);
@@ -4657,10 +4812,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
         perf_trace_destroy(event);
  }
  
-static const struct pmu *tp_perf_event_init(struct perf_event *event)
+static int perf_tp_event_init(struct perf_event *event)
  {
         int err;
  
+       if (event->attr.type != PERF_TYPE_TRACEPOINT)
+               return -ENOENT;
+
         /*
          * Raw tracepoint data is a severe data leak, only allow root to
          * have these.
@@ -4668,15 +4826,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
         if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
                         perf_paranoid_tracepoint_raw() &&
                         !capable(CAP_SYS_ADMIN))
-               return ERR_PTR(-EPERM);
+               return -EPERM;
  
         err = perf_trace_init(event);
         if (err)
-               return NULL;
+               return err;
  
         event->destroy = tp_perf_event_destroy;
  
-       return &perf_ops_tracepoint;
+       return 0;
+}
+
+/*
+ * PMU for tracepoint events.  It shares the software task context
+ * (perf_sw_context); only event_init/add/del are tracepoint specific,
+ * start/stop/read reuse the perf_swevent_* callbacks.
+ */
+static struct pmu perf_tracepoint = {
+       .task_ctx_nr    = perf_sw_context,
+
+       .event_init     = perf_tp_event_init,
+       .add            = perf_trace_add,
+       .del            = perf_trace_del,
+       .start          = perf_swevent_start,
+       .stop           = perf_swevent_stop,
+       .read           = perf_swevent_read,
+};
+
+/*
+ * Register the tracepoint PMU with the perf core.
+ * NOTE(review): the int result of perf_pmu_register() is discarded here.
+ */
+static inline void perf_tp_register(void)
+{
+       perf_pmu_register(&perf_tracepoint);
  }
  
  static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4693,133 +4867,416 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
  
         ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
  
-       kfree(filter_str);
-       return ret;
+       kfree(filter_str);
+       return ret;
+}
+
+/* Release the event's filter by delegating to ftrace_profile_free_filter(). */
+static void perf_event_free_filter(struct perf_event *event)
+{
+       ftrace_profile_free_filter(event);
+}
+
+#else
+
+/* !CONFIG_EVENT_TRACING: there is no tracepoint PMU to register. */
+static inline void perf_tp_register(void)
+{
+}
+
+/* !CONFIG_EVENT_TRACING: filters are unavailable, always fails with -ENOENT. */
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+       return -ENOENT;
+}
+
+/* !CONFIG_EVENT_TRACING: setting a filter always fails, so nothing to free. */
+static void perf_event_free_filter(struct perf_event *event)
+{
+}
+
+#endif /* CONFIG_EVENT_TRACING */
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+/*
+ * Report one hit of breakpoint event @bp.  @data is used as the
+ * struct pt_regs of the hit; the sample address is bp->attr.bp_addr.
+ */
+void perf_bp_event(struct perf_event *bp, void *data)
+{
+       struct pt_regs *regs = data;
+       struct perf_sample_data sample;
+
+       perf_sample_data_init(&sample, bp->attr.bp_addr);
+
+       /* Nothing is counted while hw.state is non-zero or the event is excluded. */
+       if (bp->hw.state || perf_exclude_event(bp, regs))
+               return;
+
+       perf_swevent_event(bp, 1, 1, &sample, regs);
+}
+#endif
+
+/*
+ * hrtimer based swevent callback
+ */
+
+/*
+ * hrtimer callback of the timer embedded in a perf_event (hw.hrtimer).
+ *
+ * Updates the event through its PMU's read callback, reports an overflow
+ * sample when one may be taken, and pushes the timer forward by one period.
+ * Returns HRTIMER_NORESTART only when perf_event_overflow() returns non-zero,
+ * HRTIMER_RESTART otherwise.
+ */
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
+{
+       enum hrtimer_restart ret = HRTIMER_RESTART;
+       struct perf_sample_data data;
+       struct pt_regs *regs;
+       struct perf_event *event;
+       u64 period;
+
+       event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+       /* Bring the event count up to date before sampling. */
+       event->pmu->read(event);
+
+       perf_sample_data_init(&data, 0);
+       data.period = event->hw.last_period;
+       regs = get_irq_regs();
+
+       /* No sample without interrupt registers or for excluded events. */
+       if (regs && !perf_exclude_event(event, regs)) {
+               /* exclude_idle: skip the sample while pid 0 (presumably idle) runs. */
+               if (!(event->attr.exclude_idle && current->pid == 0))
+                       if (perf_event_overflow(event, 0, &data, regs))
+                               ret = HRTIMER_NORESTART;
+       }
+
+       /* The timer is forwarded in every case; the period is at least 10000ns. */
+       period = max_t(u64, 10000, event->hw.sample_period);
+       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+       return ret;
+}
+
+/*
+ * Initialise the event's hrtimer and, for events with a non-zero
+ * sample_period, arm it.  A non-zero hw.period_left (as stored by
+ * perf_swevent_cancel_hrtimer()) is used as the first expiry and then
+ * cleared; otherwise a full period is used.
+ */
+static void perf_swevent_start_hrtimer(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hwc->hrtimer.function = perf_swevent_hrtimer;
+       if (hwc->sample_period) {
+               s64 period = local64_read(&hwc->period_left);
+
+               if (period) {
+                       /* A negative remainder is replaced by 10000ns. */
+                       if (period < 0)
+                               period = 10000;
+
+                       local64_set(&hwc->period_left, 0);
+               } else {
+                       /* Full period, but never shorter than 10000ns. */
+                       period = max_t(u64, 10000, hwc->sample_period);
+               }
+               __hrtimer_start_range_ns(&hwc->hrtimer,
+                               ns_to_ktime(period), 0,
+                               HRTIMER_MODE_REL_PINNED, 0);
+       }
+}
+
+/*
+ * Cancel the event's sampling timer.  The time that was still left on it
+ * is saved in hw.period_left first.  Events without a sample period are
+ * left alone.
+ */
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       ktime_t left;
+
+       if (!hwc->sample_period)
+               return;
+
+       /* Record the remainder before the timer is torn down. */
+       left = hrtimer_get_remaining(&hwc->hrtimer);
+       local64_set(&hwc->period_left, ktime_to_ns(left));
+
+       hrtimer_cancel(&hwc->hrtimer);
+}
+
+/*
+ * Software event: cpu wall time clock
+ */
+
+/*
+ * Add the local_clock() time elapsed since hw.prev_count to the event
+ * count and make the current clock value the new hw.prev_count.
+ */
+static void cpu_clock_event_update(struct perf_event *event)
+{
+       u64 now = local_clock();
+       s64 prev = local64_xchg(&event->hw.prev_count, now);
+
+       local64_add(now - prev, &event->count);
+}
+
+/*
+ * Start callback: take the current local_clock() as the new baseline and
+ * set up the sampling timer.  @flags is not used.
+ */
+static void cpu_clock_event_start(struct perf_event *event, int flags)
+{
+       u64 now = local_clock();
+
+       local64_set(&event->hw.prev_count, now);
+       perf_swevent_start_hrtimer(event);
+}
+
+/*
+ * Stop callback: cancel the sampling timer, then fold the time elapsed
+ * since the last update into the event count.  @flags is not used.
+ */
+static void cpu_clock_event_stop(struct perf_event *event, int flags)
+{
+       perf_swevent_cancel_hrtimer(event);
+       cpu_clock_event_update(event);
+}
+
+/*
+ * Add callback: starts the event only when PERF_EF_START is set in @flags.
+ * Always returns 0.
+ */
+static int cpu_clock_event_add(struct perf_event *event, int flags)
+{
+       if (flags & PERF_EF_START)
+               cpu_clock_event_start(event, flags);
+
+       return 0;
+}
+
+/* Del callback: removing the event is the same as stopping it. */
+static void cpu_clock_event_del(struct perf_event *event, int flags)
+{
+       cpu_clock_event_stop(event, flags);
+}
+
+/* Read callback: bring event->count up to the current local_clock(). */
+static void cpu_clock_event_read(struct perf_event *event)
+{
+       cpu_clock_event_update(event);
+}
+
+/*
+ * event_init callback: claim only PERF_TYPE_SOFTWARE events whose config
+ * is PERF_COUNT_SW_CPU_CLOCK; everything else gets -ENOENT.
+ */
+static int cpu_clock_event_init(struct perf_event *event)
+{
+       if (event->attr.type != PERF_TYPE_SOFTWARE ||
+           event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
+               return -ENOENT;
+
+       return 0;
+}
+
+/*
+ * PMU for PERF_COUNT_SW_CPU_CLOCK events; uses the software task context
+ * (perf_sw_context).
+ */
+static struct pmu perf_cpu_clock = {
+       .task_ctx_nr    = perf_sw_context,
+
+       .event_init     = cpu_clock_event_init,
+       .add            = cpu_clock_event_add,
+       .del            = cpu_clock_event_del,
+       .start          = cpu_clock_event_start,
+       .stop           = cpu_clock_event_stop,
+       .read           = cpu_clock_event_read,
+};
+
+/*
+ * Software event: task time clock
+ */
+
+/*
+ * Advance the event count by the time between hw.prev_count and @now, and
+ * store @now as the new hw.prev_count.
+ */
+static void task_clock_event_update(struct perf_event *event, u64 now)
+{
+       u64 prev = local64_xchg(&event->hw.prev_count, now);
+       s64 delta = now - prev;
+
+       local64_add(delta, &event->count);
+}
+
+/*
+ * Start callback: take the context time (event->ctx->time) as the new
+ * baseline and set up the sampling timer.  @flags is not used.
+ */
+static void task_clock_event_start(struct perf_event *event, int flags)
+{
+       local64_set(&event->hw.prev_count, event->ctx->time);
+       perf_swevent_start_hrtimer(event);
+}
+
+/*
+ * Stop callback: cancel the sampling timer, then account the context time
+ * elapsed since the last update.  @flags is not used.
+ */
+static void task_clock_event_stop(struct perf_event *event, int flags)
+{
+       perf_swevent_cancel_hrtimer(event);
+       task_clock_event_update(event, event->ctx->time);
+}
+
+/*
+ * Add callback: starts the event only when PERF_EF_START is set in @flags.
+ * Always returns 0.
+ */
+static int task_clock_event_add(struct perf_event *event, int flags)
+{
+       if (flags & PERF_EF_START)
+               task_clock_event_start(event, flags);
+
+       return 0;
+}
+
+/*
+ * Del callback: stops the event.  PERF_EF_UPDATE is passed rather than
+ * @flags; task_clock_event_stop() does not look at its flags argument.
+ */
+static void task_clock_event_del(struct perf_event *event, int flags)
+{
+       task_clock_event_stop(event, PERF_EF_UPDATE);
+}
+
+/*
+ * Read callback: work out the current context time and account the time
+ * elapsed since the last update.
+ */
+static void task_clock_event_read(struct perf_event *event)
+{
+       u64 time;
+
+       if (!in_nmi()) {
+               update_context_time(event->ctx);
+               time = event->ctx->time;
+       } else {
+               /*
+                * In NMI context the context is not updated; the time is
+                * derived from ctx->time plus the perf_clock() time since
+                * ctx->timestamp.  NOTE(review): presumably because
+                * update_context_time() must not run from NMI - confirm.
+                */
+               u64 now = perf_clock();
+               u64 delta = now - event->ctx->timestamp;
+               time = event->ctx->time + delta;
+       }
+
+       task_clock_event_update(event, time);
  }
  
-static void perf_event_free_filter(struct perf_event *event)
+static int task_clock_event_init(struct perf_event *event)
  {
-       ftrace_profile_free_filter(event);
+       if (event->attr.type != PERF_TYPE_SOFTWARE)
+               return -ENOENT;
+
+       if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
+               return -ENOENT;
+
+       return 0;
  }
  
-#else
+static struct pmu perf_task_clock = {
+       .task_ctx_nr    = perf_sw_context,
+
+       .event_init     = task_clock_event_init,
+       .add            = task_clock_event_add,
+       .del            = task_clock_event_del,
+       .start          = task_clock_event_start,
+       .stop           = task_clock_event_stop,
+       .read           = task_clock_event_read,
+};
  
-static const struct pmu *tp_perf_event_init(struct perf_event *event)
+static void perf_pmu_nop_void(struct pmu *pmu)
  {
-       return NULL;
  }
  
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+static int perf_pmu_nop_int(struct pmu *pmu)
  {
-       return -ENOENT;
+       return 0;
  }
  
-static void perf_event_free_filter(struct perf_event *event)
+static void perf_pmu_start_txn(struct pmu *pmu)
  {
+       perf_pmu_disable(pmu);
  }
  
-#endif /* CONFIG_EVENT_TRACING */
+static int perf_pmu_commit_txn(struct pmu *pmu)
+{
+       perf_pmu_enable(pmu);
+       return 0;
+}
  
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
-static void bp_perf_event_destroy(struct perf_event *event)
+static void perf_pmu_cancel_txn(struct pmu *pmu)
  {
-       release_bp_slot(event);
+       perf_pmu_enable(pmu);
  }
  
-static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+/*
+ * Ensures all contexts with the same task_ctx_nr have the same
+ * pmu_cpu_context too.
+ */
+static void *find_pmu_context(int ctxn)
  {
-       int err;
+       struct pmu *pmu;
  
-       err = register_perf_hw_breakpoint(bp);
-       if (err)
-               return ERR_PTR(err);
+       if (ctxn < 0)
+               return NULL;
  
-       bp->destroy = bp_perf_event_destroy;
+       list_for_each_entry(pmu, &pmus, entry) {
+               if (pmu->task_ctx_nr == ctxn)
+                       return pmu->pmu_cpu_context;
+       }
  
-       return &perf_ops_bp;
+       return NULL;
  }
  
-void perf_bp_event(struct perf_event *bp, void *data)
+static void free_pmu_context(void * __percpu cpu_context)
  {
-       struct perf_sample_data sample;
-       struct pt_regs *regs = data;
+       struct pmu *pmu;
  
-       perf_sample_data_init(&sample, bp->attr.bp_addr);
+       mutex_lock(&pmus_lock);
+       /*
+        * Like a real lame refcount.
+        */
+       list_for_each_entry(pmu, &pmus, entry) {
+               if (pmu->pmu_cpu_context == cpu_context)
+                       goto out;
+       }
  
-       if (!perf_exclude_event(bp, regs))
-               perf_swevent_add(bp, 1, 1, &sample, regs);
-}
-#else
-static const struct pmu *bp_perf_event_init(struct perf_event *bp)
-{
-       return NULL;
+       free_percpu(cpu_context);
+out:
+       mutex_unlock(&pmus_lock);
  }
  
-void perf_bp_event(struct perf_event *bp, void *regs)
+int perf_pmu_register(struct pmu *pmu)
  {
-}
-#endif
+       int cpu, ret;
  
-atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+       mutex_lock(&pmus_lock);
+       ret = -ENOMEM;
+       pmu->pmu_disable_count = alloc_percpu(int);
+       if (!pmu->pmu_disable_count)
+               goto unlock;
  
-static void sw_perf_event_destroy(struct perf_event *event)
-{
-       u64 event_id = event->attr.config;
+       pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
+       if (pmu->pmu_cpu_context)
+               goto got_cpu_context;
  
-       WARN_ON(event->parent);
+       pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
+       if (!pmu->pmu_cpu_context)
+               goto free_pdc;
  
-       atomic_dec(&perf_swevent_enabled[event_id]);
-       swevent_hlist_put(event);
+       for_each_possible_cpu(cpu) {
+               struct perf_cpu_context *cpuctx;
+
+               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+               __perf_event_init_context(&cpuctx->ctx);
+               cpuctx->ctx.type = cpu_context;
+               cpuctx->ctx.pmu = pmu;
+               cpuctx->jiffies_interval = 1;
+               INIT_LIST_HEAD(&cpuctx->rotation_list);
+       }
+
+got_cpu_context:
+       if (!pmu->start_txn) {
+               if (pmu->pmu_enable) {
+                       /*
+                        * If we have pmu_enable/pmu_disable calls, install
+                        * transaction stubs that use that to try and batch
+                        * hardware accesses.
+                        */
+                       pmu->start_txn  = perf_pmu_start_txn;
+                       pmu->commit_txn = perf_pmu_commit_txn;
+                       pmu->cancel_txn = perf_pmu_cancel_txn;
+               } else {
+                       pmu->start_txn  = perf_pmu_nop_void;
+                       pmu->commit_txn = perf_pmu_nop_int;
+                       pmu->cancel_txn = perf_pmu_nop_void;
+               }
+       }
+
+       if (!pmu->pmu_enable) {
+               pmu->pmu_enable  = perf_pmu_nop_void;
+               pmu->pmu_disable = perf_pmu_nop_void;
+       }
+
+       list_add_rcu(&pmu->entry, &pmus);
+       ret = 0;
+unlock:
+       mutex_unlock(&pmus_lock);
+
+       return ret;
+
+free_pdc:
+       free_percpu(pmu->pmu_disable_count);
+       goto unlock;
  }
  
-static const struct pmu *sw_perf_event_init(struct perf_event *event)
+void perf_pmu_unregister(struct pmu *pmu)
  {
-       const struct pmu *pmu = NULL;
-       u64 event_id = event->attr.config;
+       mutex_lock(&pmus_lock);
+       list_del_rcu(&pmu->entry);
+       mutex_unlock(&pmus_lock);
  
         /*
-        * Software events (currently) can't in general distinguish
-        * between user, kernel and hypervisor events.
-        * However, context switches and cpu migrations are considered
-        * to be kernel events, and page faults are never hypervisor
-        * events.
+        * We dereference the pmu list under both SRCU and regular RCU, so
+        * synchronize against both of those.
          */
-       switch (event_id) {
-       case PERF_COUNT_SW_CPU_CLOCK:
-               pmu = &perf_ops_cpu_clock;
+       synchronize_srcu(&pmus_srcu);
+       synchronize_rcu();
  
-               break;
-       case PERF_COUNT_SW_TASK_CLOCK:
-               /*
-                * If the user instantiates this as a per-cpu event,
-                * use the cpu_clock event instead.
-                */
-               if (event->ctx->task)
-                       pmu = &perf_ops_task_clock;
-               else
-                       pmu = &perf_ops_cpu_clock;
+       free_percpu(pmu->pmu_disable_count);
+       free_pmu_context(pmu->pmu_cpu_context);
+}
  
-               break;
-       case PERF_COUNT_SW_PAGE_FAULTS:
-       case PERF_COUNT_SW_PAGE_FAULTS_MIN:
-       case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
-       case PERF_COUNT_SW_CONTEXT_SWITCHES:
-       case PERF_COUNT_SW_CPU_MIGRATIONS:
-       case PERF_COUNT_SW_ALIGNMENT_FAULTS:
-       case PERF_COUNT_SW_EMULATION_FAULTS:
-               if (!event->parent) {
-                       int err;
-
-                       err = swevent_hlist_get(event);
-                       if (err)
-                               return ERR_PTR(err);
+struct pmu *perf_init_event(struct perf_event *event)
+{
+       struct pmu *pmu = NULL;
+       int idx;
  
-                       atomic_inc(&perf_swevent_enabled[event_id]);
-                       event->destroy = sw_perf_event_destroy;
+       idx = srcu_read_lock(&pmus_srcu);
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               int ret = pmu->event_init(event);
+               if (!ret)
+                       goto unlock;
+
+               if (ret != -ENOENT) {
+                       pmu = ERR_PTR(ret);
+                       goto unlock;
                 }
-               pmu = &perf_ops_generic;
-               break;
         }
+       pmu = ERR_PTR(-ENOENT);
+unlock:
+       srcu_read_unlock(&pmus_srcu, idx);
  
         return pmu;
  }
@@ -4828,20 +5285,17 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
   * Allocate and initialize a event structure
   */
  static struct perf_event *
-perf_event_alloc(struct perf_event_attr *attr,
-                  int cpu,
-                  struct perf_event_context *ctx,
+perf_event_alloc(struct perf_event_attr *attr, int cpu,
                    struct perf_event *group_leader,
                    struct perf_event *parent_event,
-                  perf_overflow_handler_t overflow_handler,
-                  gfp_t gfpflags)
+                  perf_overflow_handler_t overflow_handler)
  {
-       const struct pmu *pmu;
+       struct pmu *pmu;
         struct perf_event *event;
         struct hw_perf_event *hwc;
         long err;
  
-       event = kzalloc(sizeof(*event), gfpflags);
+       event = kzalloc(sizeof(*event), GFP_KERNEL);
         if (!event)
                 return ERR_PTR(-ENOMEM);
  
@@ -4866,7 +5320,6 @@ perf_event_alloc(struct perf_event_attr *attr,
         event->attr             = *attr;
         event->group_leader     = group_leader;
         event->pmu              = NULL;
-       event->ctx              = ctx;
         event->oncpu            = -1;
  
         event->parent           = parent_event;
@@ -4900,29 +5353,8 @@ perf_event_alloc(struct perf_event_attr *attr,
         if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
                 goto done;
  
-       switch (attr->type) {
-       case PERF_TYPE_RAW:
-       case PERF_TYPE_HARDWARE:
-       case PERF_TYPE_HW_CACHE:
-               pmu = hw_perf_event_init(event);
-               break;
-
-       case PERF_TYPE_SOFTWARE:
-               pmu = sw_perf_event_init(event);
-               break;
-
-       case PERF_TYPE_TRACEPOINT:
-               pmu = tp_perf_event_init(event);
-               break;
-
-       case PERF_TYPE_BREAKPOINT:
-               pmu = bp_perf_event_init(event);
-               break;
-
+       pmu = perf_init_event(event);
  
-       default:
-               break;
-       }
  done:
         err = 0;
         if (!pmu)
@@ -4947,6 +5379,13 @@ done:
                         atomic_inc(&nr_comm_events);
                 if (event->attr.task)
                         atomic_inc(&nr_task_events);
+               if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
+                       err = get_callchain_buffers();
+                       if (err) {
+                               free_event(event);
+                               return ERR_PTR(err);
+                       }
+               }
         }
  
         return event;
@@ -5094,12 +5533,16 @@ SYSCALL_DEFINE5(perf_event_open,
                 struct perf_event_attr __user *, attr_uptr,
                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
  {
-       struct perf_event *event, *group_leader = NULL, *output_event = NULL;
+       struct perf_event *group_leader = NULL, *output_event = NULL;
+       struct perf_event *event, *sibling;
         struct perf_event_attr attr;
         struct perf_event_context *ctx;
         struct file *event_file = NULL;
         struct file *group_file = NULL;
+       struct task_struct *task = NULL;
+       struct pmu *pmu;
         int event_fd;
+       int move_group = 0;
         int fput_needed = 0;
         int err;
  
@@ -5125,20 +5568,11 @@ SYSCALL_DEFINE5(perf_event_open,
         if (event_fd < 0)
                 return event_fd;
  
-       /*
-        * Get the target context (task or percpu):
-        */
-       ctx = find_get_context(pid, cpu);
-       if (IS_ERR(ctx)) {
-               err = PTR_ERR(ctx);
-               goto err_fd;
-       }
-
         if (group_fd != -1) {
                 group_leader = perf_fget_light(group_fd, &fput_needed);
                 if (IS_ERR(group_leader)) {
                         err = PTR_ERR(group_leader);
-                       goto err_put_context;
+                       goto err_fd;
                 }
                 group_file = group_leader->filp;
                 if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5147,6 +5581,53 @@ SYSCALL_DEFINE5(perf_event_open,
                         group_leader = NULL;
         }
  
+       event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
+       if (IS_ERR(event)) {
+               err = PTR_ERR(event);
+               goto err_fd;
+       }
+
+       /*
+        * Special case software events and allow them to be part of
+        * any hardware group.
+        */
+       pmu = event->pmu;
+
+       if (group_leader &&
+           (is_software_event(event) != is_software_event(group_leader))) {
+               if (is_software_event(event)) {
+                       /*
+                        * If event and group_leader are not both a software
+                        * event, and event is, then group leader is not.
+                        *
+                        * Allow the addition of software events to !software
+                        * groups, this is safe because software events never
+                        * fail to schedule.
+                        */
+                       pmu = group_leader->pmu;
+               } else if (is_software_event(group_leader) &&
+                          (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+                       /*
+                        * In case the group is a pure software group, and we
+                        * try to add a hardware event, move the whole group to
+                        * the hardware context.
+                        */
+                       move_group = 1;
+               }
+       }
+
+       if (pid != -1)
+               task = find_lively_task_by_vpid(pid);
+
+       /*
+        * Get the target context (task or percpu):
+        */
+       ctx = find_get_context(pmu, task, cpu);
+       if (IS_ERR(ctx)) {
+               err = PTR_ERR(ctx);
+               goto err_group_fd;
+       }
+
         /*
          * Look up the group leader (we will attach this event to it):
          */
@@ -5158,42 +5639,66 @@ SYSCALL_DEFINE5(perf_event_open,
                  * becoming part of another group-sibling):
                  */
                 if (group_leader->group_leader != group_leader)
-                       goto err_put_context;
+                       goto err_context;
                 /*
                  * Do not allow to attach to a group in a different
                  * task or CPU context:
                  */
-               if (group_leader->ctx != ctx)
-                       goto err_put_context;
+               if (move_group) {
+                       if (group_leader->ctx->type != ctx->type)
+                               goto err_context;
+               } else {
+                       if (group_leader->ctx != ctx)
+                               goto err_context;
+               }
+
                 /*
                  * Only a group leader can be exclusive or pinned
                  */
                 if (attr.exclusive || attr.pinned)
-                       goto err_put_context;
-       }
-
-       event = perf_event_alloc(&attr, cpu, ctx, group_leader,
-                                    NULL, NULL, GFP_KERNEL);
-       if (IS_ERR(event)) {
-               err = PTR_ERR(event);
-               goto err_put_context;
+                       goto err_context;
         }
  
         if (output_event) {
                 err = perf_event_set_output(event, output_event);
                 if (err)
-                       goto err_free_put_context;
+                       goto err_context;
         }
  
         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
         if (IS_ERR(event_file)) {
                 err = PTR_ERR(event_file);
-               goto err_free_put_context;
+               goto err_context;
+       }
+
+       if (move_group) {
+               struct perf_event_context *gctx = group_leader->ctx;
+
+               mutex_lock(&gctx->mutex);
+               perf_event_remove_from_context(group_leader);
+               list_for_each_entry(sibling, &group_leader->sibling_list,
+                                   group_entry) {
+                       perf_event_remove_from_context(sibling);
+                       put_ctx(gctx);
+               }
+               mutex_unlock(&gctx->mutex);
+               put_ctx(gctx);
         }
  
         event->filp = event_file;
         WARN_ON_ONCE(ctx->parent_ctx);
         mutex_lock(&ctx->mutex);
+
+       if (move_group) {
+               perf_install_in_context(ctx, group_leader, cpu);
+               get_ctx(ctx);
+               list_for_each_entry(sibling, &group_leader->sibling_list,
+                                   group_entry) {
+                       perf_install_in_context(ctx, sibling, cpu);
+                       get_ctx(ctx);
+               }
+       }
+
         perf_install_in_context(ctx, event, cpu);
         ++ctx->generation;
         mutex_unlock(&ctx->mutex);
@@ -5214,11 +5719,11 @@ SYSCALL_DEFINE5(perf_event_open,
         fd_install(event_fd, event_file);
         return event_fd;
  
-err_free_put_context:
-       free_event(event);
-err_put_context:
-       fput_light(group_file, fput_needed);
+err_context:
         put_ctx(ctx);
+err_group_fd:
+       fput_light(group_file, fput_needed);
+       free_event(event);
  err_fd:
         put_unused_fd(event_fd);
         return err;
@@ -5229,154 +5734,54 @@ err_fd:
   *
   * @attr: attributes of the counter to create
   * @cpu: cpu in which the counter is bound
- * @pid: task to profile
+ * @task: task to profile (NULL for percpu)
   */
  struct perf_event *
  perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
-                                pid_t pid,
+                                struct task_struct *task,
                                  perf_overflow_handler_t overflow_handler)
  {
-       struct perf_event *event;
         struct perf_event_context *ctx;
-       int err;
-
-       /*
-        * Get the target context (task or percpu):
-        */
-
-       ctx = find_get_context(pid, cpu);
-       if (IS_ERR(ctx)) {
-               err = PTR_ERR(ctx);
-               goto err_exit;
-       }
-
-       event = perf_event_alloc(attr, cpu, ctx, NULL,
-                                NULL, overflow_handler, GFP_KERNEL);
-       if (IS_ERR(event)) {
-               err = PTR_ERR(event);
-               goto err_put_context;
-       }
-
-       event->filp = NULL;
-       WARN_ON_ONCE(ctx->parent_ctx);
-       mutex_lock(&ctx->mutex);
-       perf_install_in_context(ctx, event, cpu);
-       ++ctx->generation;
-       mutex_unlock(&ctx->mutex);
-
-       event->owner = current;
-       get_task_struct(current);
-       mutex_lock(&current->perf_event_mutex);
-       list_add_tail(&event->owner_entry, &current->perf_event_list);
-       mutex_unlock(&current->perf_event_mutex);
-
-       return event;
-
- err_put_context:
-       put_ctx(ctx);
- err_exit:
-       return ERR_PTR(err);
-}
-EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
-
-/*
- * inherit a event from parent task to child task:
- */
-static struct perf_event *
-inherit_event(struct perf_event *parent_event,
-             struct task_struct *parent,
-             struct perf_event_context *parent_ctx,
-             struct task_struct *child,
-             struct perf_event *group_leader,
-             struct perf_event_context *child_ctx)
-{
-       struct perf_event *child_event;
-
-       /*
-        * Instead of creating recursive hierarchies of events,
-        * we link inherited events back to the original parent,
-        * which has a filp for sure, which we use as the reference
-        * count:
-        */
-       if (parent_event->parent)
-               parent_event = parent_event->parent;
-
-       child_event = perf_event_alloc(&parent_event->attr,
-                                          parent_event->cpu, child_ctx,
-                                          group_leader, parent_event,
-                                          NULL, GFP_KERNEL);
-       if (IS_ERR(child_event))
-               return child_event;
-       get_ctx(child_ctx);
-
-       /*
-        * Make the child state follow the state of the parent event,
-        * not its attr.disabled bit.  We hold the parent's mutex,
-        * so we won't race with perf_event_{en, dis}able_family.
-        */
-       if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
-               child_event->state = PERF_EVENT_STATE_INACTIVE;
-       else
-               child_event->state = PERF_EVENT_STATE_OFF;
-
-       if (parent_event->attr.freq) {
-               u64 sample_period = parent_event->hw.sample_period;
-               struct hw_perf_event *hwc = &child_event->hw;
-
-               hwc->sample_period = sample_period;
-               hwc->last_period   = sample_period;
-
-               local64_set(&hwc->period_left, sample_period);
-       }
-
-       child_event->overflow_handler = parent_event->overflow_handler;
-
-       /*
-        * Link it up in the child's context:
-        */
-       add_event_to_ctx(child_event, child_ctx);
-
-       /*
-        * Get a reference to the parent filp - we will fput it
-        * when the child event exits. This is safe to do because
-        * we are in the parent and we know that the filp still
-        * exists and has a nonzero count:
-        */
-       atomic_long_inc(&parent_event->filp->f_count);
-
-       /*
-        * Link this into the parent event's child list
-        */
-       WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-       mutex_lock(&parent_event->child_mutex);
-       list_add_tail(&child_event->child_list, &parent_event->child_list);
-       mutex_unlock(&parent_event->child_mutex);
+       struct perf_event *event;
+       int err;
  
-       return child_event;
-}
+       /*
+        * Get the target context (task or percpu):
+        */
  
-static int inherit_group(struct perf_event *parent_event,
-             struct task_struct *parent,
-             struct perf_event_context *parent_ctx,
-             struct task_struct *child,
-             struct perf_event_context *child_ctx)
-{
-       struct perf_event *leader;
-       struct perf_event *sub;
-       struct perf_event *child_ctr;
+       event = perf_event_alloc(attr, cpu, NULL, NULL, overflow_handler);
+       if (IS_ERR(event)) {
+               err = PTR_ERR(event);
+               goto err;
+       }
  
-       leader = inherit_event(parent_event, parent, parent_ctx,
-                                child, NULL, child_ctx);
-       if (IS_ERR(leader))
-               return PTR_ERR(leader);
-       list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
-               child_ctr = inherit_event(sub, parent, parent_ctx,
-                                           child, leader, child_ctx);
-               if (IS_ERR(child_ctr))
-                       return PTR_ERR(child_ctr);
+       ctx = find_get_context(event->pmu, task, cpu);
+       if (IS_ERR(ctx)) {
+               err = PTR_ERR(ctx);
+               goto err_free;
         }
-       return 0;
+
+       event->filp = NULL;
+       WARN_ON_ONCE(ctx->parent_ctx);
+       mutex_lock(&ctx->mutex);
+       perf_install_in_context(ctx, event, cpu);
+       ++ctx->generation;
+       mutex_unlock(&ctx->mutex);
+
+       event->owner = current;
+       get_task_struct(current);
+       mutex_lock(&current->perf_event_mutex);
+       list_add_tail(&event->owner_entry, &current->perf_event_list);
+       mutex_unlock(&current->perf_event_mutex);
+
+       return event;
+
+err_free:
+       free_event(event);
+err:
+       return ERR_PTR(err);
  }
+EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
  
  static void sync_child_event(struct perf_event *child_event,
                                struct task_struct *child)
@@ -5434,16 +5839,13 @@ __perf_event_exit_task(struct perf_event *child_event,
         }
  }
  
-/*
- * When a child task exits, feed back event values to parent events.
- */
-void perf_event_exit_task(struct task_struct *child)
+static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
  {
         struct perf_event *child_event, *tmp;
         struct perf_event_context *child_ctx;
         unsigned long flags;
  
-       if (likely(!child->perf_event_ctxp)) {
+       if (likely(!child->perf_event_ctxp[ctxn])) {
                 perf_event_task(child, NULL, 0);
                 return;
         }
@@ -5455,7 +5857,7 @@ void perf_event_exit_task(struct task_struct *child)
          * scheduled, so we are now safe from rescheduling changing
          * our context.
          */
-       child_ctx = child->perf_event_ctxp;
+       child_ctx = child->perf_event_ctxp[ctxn];
         __perf_event_task_sched_out(child_ctx);
  
         /*
@@ -5464,7 +5866,7 @@ void perf_event_exit_task(struct task_struct *child)
          * incremented the context's refcount before we do put_ctx below.
          */
         raw_spin_lock(&child_ctx->lock);
-       child->perf_event_ctxp = NULL;
+       child->perf_event_ctxp[ctxn] = NULL;
         /*
          * If this context is a clone; unclone it so it can't get
          * swapped to another process while we're removing all
@@ -5517,6 +5919,17 @@ again:
         put_ctx(child_ctx);
  }
  
+/*
+ * When a child task exits, feed back event values to parent events.
+ */
+void perf_event_exit_task(struct task_struct *child)
+{
+       int ctxn;       /* index into child->perf_event_ctxp[] */
+
+       for_each_task_context_nr(ctxn)  /* one helper call per task context number */
+               perf_event_exit_task_context(child, ctxn);
+}
+
  static void perf_free_event(struct perf_event *event,
                             struct perf_event_context *ctx)
  {
@@ -5538,48 +5951,165 @@ static void perf_free_event(struct perf_event *event,
  
  /*
   * free an unexposed, unused context as created by inheritance by
- * init_task below, used by fork() in case of fail.
+ * perf_event_init_task below, used by fork() in case of fail.
   */
  void perf_event_free_task(struct task_struct *task)
  {
-       struct perf_event_context *ctx = task->perf_event_ctxp;
+       struct perf_event_context *ctx;
         struct perf_event *event, *tmp;
+       int ctxn;
  
-       if (!ctx)
-               return;
+       for_each_task_context_nr(ctxn) {
+               ctx = task->perf_event_ctxp[ctxn];
+               if (!ctx)
+                       continue;
  
-       mutex_lock(&ctx->mutex);
+               mutex_lock(&ctx->mutex);
  again:
-       list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-               perf_free_event(event, ctx);
+               list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
+                               group_entry)
+                       perf_free_event(event, ctx);
  
-       list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
-                                group_entry)
-               perf_free_event(event, ctx);
+               list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
+                               group_entry)
+                       perf_free_event(event, ctx);
  
-       if (!list_empty(&ctx->pinned_groups) ||
-           !list_empty(&ctx->flexible_groups))
-               goto again;
+               if (!list_empty(&ctx->pinned_groups) ||
+                               !list_empty(&ctx->flexible_groups))
+                       goto again;
  
-       mutex_unlock(&ctx->mutex);
+               mutex_unlock(&ctx->mutex);
  
-       put_ctx(ctx);
+               put_ctx(ctx);
+       }
+}
+
+void perf_event_delayed_put(struct task_struct *task)
+{      /* Check only: warns (once per call site) if @task still has a context pointer set. */
+       int ctxn;       /* index into task->perf_event_ctxp[] */
+
+       for_each_task_context_nr(ctxn)
+               WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);      /* every slot is expected to be NULL here */
+}
+
+/*
+ * inherit an event from parent task to child task:
+ */
+static struct perf_event *
+inherit_event(struct perf_event *parent_event,
+             struct task_struct *parent,       /* not referenced in the body */
+             struct perf_event_context *parent_ctx,    /* not referenced in the body */
+             struct task_struct *child,        /* not referenced in the body */
+             struct perf_event *group_leader,  /* leader for the new event; NULL when it leads its own group */
+             struct perf_event_context *child_ctx)
+{
+       struct perf_event *child_event;
+       unsigned long flags;    /* saved IRQ state while child_ctx->lock is held */
+
+       /*
+        * Instead of creating recursive hierarchies of events,
+        * we link inherited events back to the original parent,
+        * which has a filp for sure, which we use as the reference
+        * count:
+        */
+       if (parent_event->parent)
+               parent_event = parent_event->parent;
+
+       child_event = perf_event_alloc(&parent_event->attr,
+                                          parent_event->cpu,
+                                          group_leader, parent_event,
+                                          NULL);
+       if (IS_ERR(child_event))
+               return child_event;     /* the ERR_PTR is handed straight back to the caller */
+       get_ctx(child_ctx);     /* taken once per inherited event */
+
+       /*
+        * Make the child state follow the state of the parent event,
+        * not its attr.disabled bit.  We hold the parent's mutex,
+        * so we won't race with perf_event_{en, dis}able_family.
+        */
+       if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
+               child_event->state = PERF_EVENT_STATE_INACTIVE;
+       else
+               child_event->state = PERF_EVENT_STATE_OFF;
+
+       if (parent_event->attr.freq) {  /* copy the parent's current sample period into the child */
+               u64 sample_period = parent_event->hw.sample_period;
+               struct hw_perf_event *hwc = &child_event->hw;
+
+               hwc->sample_period = sample_period;
+               hwc->last_period   = sample_period;
+
+               local64_set(&hwc->period_left, sample_period);
+       }
+
+       child_event->ctx = child_ctx;
+       child_event->overflow_handler = parent_event->overflow_handler;
+
+       /*
+        * Link it up in the child's context:
+        */
+       raw_spin_lock_irqsave(&child_ctx->lock, flags);
+       add_event_to_ctx(child_event, child_ctx);
+       raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+       /*
+        * Get a reference to the parent filp - we will fput it
+        * when the child event exits. This is safe to do because
+        * we are in the parent and we know that the filp still
+        * exists and has a nonzero count:
+        */
+       atomic_long_inc(&parent_event->filp->f_count);  /* NOTE(review): filp is dereferenced unchecked - confirm it cannot be NULL here */
+
+       /*
+        * Link this into the parent event's child list
+        */
+       WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+       mutex_lock(&parent_event->child_mutex);
+       list_add_tail(&child_event->child_list, &parent_event->child_list);
+       mutex_unlock(&parent_event->child_mutex);
+
+       return child_event;
+}
+
+static int inherit_group(struct perf_event *parent_event,
+             struct task_struct *parent,
+             struct perf_event_context *parent_ctx,
+             struct task_struct *child,
+             struct perf_event_context *child_ctx)
+{      /* Inherit parent_event and then each of its siblings into child_ctx; returns 0 or a negative errno. */
+       struct perf_event *leader;      /* child copy of parent_event */
+       struct perf_event *sub;         /* cursor over parent_event->sibling_list */
+       struct perf_event *child_ctr;   /* child copy of the current sibling */
+
+       leader = inherit_event(parent_event, parent, parent_ctx,
+                                child, NULL, child_ctx);       /* NULL group_leader: this copy leads the new group */
+       if (IS_ERR(leader))
+               return PTR_ERR(leader);
+       list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
+               child_ctr = inherit_event(sub, parent, parent_ctx,
+                                           child, leader, child_ctx);  /* sibling copies are grouped under the new leader */
+               if (IS_ERR(child_ctr))
+                       return PTR_ERR(child_ctr);      /* NOTE(review): earlier copies are not undone here - presumably the caller cleans up; verify */
+       }
+       return 0;
  }
  
  static int
  inherit_task_group(struct perf_event *event, struct task_struct *parent,
                    struct perf_event_context *parent_ctx,
-                  struct task_struct *child,
+                  struct task_struct *child, int ctxn,
                    int *inherited_all)
  {
         int ret;
-       struct perf_event_context *child_ctx = child->perf_event_ctxp;
+       struct perf_event_context *child_ctx;
  
         if (!event->attr.inherit) {
                 *inherited_all = 0;
                 return 0;
         }
  
+               child_ctx = child->perf_event_ctxp[ctxn];
         if (!child_ctx) {
                 /*
                  * This is executed from the parent task context, so
@@ -5588,14 +6118,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
                  * child.
                  */
  
-               child_ctx = kzalloc(sizeof(struct perf_event_context),
-                                   GFP_KERNEL);
+               child_ctx = alloc_perf_context(event->pmu, child);
                 if (!child_ctx)
                         return -ENOMEM;
  
-               __perf_event_init_context(child_ctx, child);
-               child->perf_event_ctxp = child_ctx;
-               get_task_struct(child);
+               child->perf_event_ctxp[ctxn] = child_ctx;
         }
  
         ret = inherit_group(event, parent, parent_ctx,
@@ -5607,11 +6134,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
         return ret;
  }
  
-
  /*
   * Initialize the perf_event context in task_struct
   */
-int perf_event_init_task(struct task_struct *child)
+int perf_event_init_context(struct task_struct *child, int ctxn)
  {
         struct perf_event_context *child_ctx, *parent_ctx;
         struct perf_event_context *cloned_ctx;
@@ -5620,19 +6146,19 @@ int perf_event_init_task(struct task_struct *child)
         int inherited_all = 1;
         int ret = 0;
  
-       child->perf_event_ctxp = NULL;
+       child->perf_event_ctxp[ctxn] = NULL;
  
         mutex_init(&child->perf_event_mutex);
         INIT_LIST_HEAD(&child->perf_event_list);
  
-       if (likely(!parent->perf_event_ctxp))
+       if (likely(!parent->perf_event_ctxp[ctxn]))
                 return 0;
  
         /*
          * If the parent's context is a clone, pin it so it won't get
          * swapped under us.
          */
-       parent_ctx = perf_pin_task_context(parent);
+       parent_ctx = perf_pin_task_context(parent, ctxn);
  
         /*
          * No need to check if parent_ctx != NULL here; since we saw
@@ -5652,20 +6178,20 @@ int perf_event_init_task(struct task_struct *child)
          * the list, not manipulating it:
          */
         list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
-               ret = inherit_task_group(event, parent, parent_ctx, child,
-                                        &inherited_all);
+               ret = inherit_task_group(event, parent, parent_ctx,
+                                        child, ctxn, &inherited_all);
                 if (ret)
                         break;
         }
  
         list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
-               ret = inherit_task_group(event, parent, parent_ctx, child,
-                                        &inherited_all);
+               ret = inherit_task_group(event, parent, parent_ctx,
+                                        child, ctxn, &inherited_all);
                 if (ret)
                         break;
         }
  
-       child_ctx = child->perf_event_ctxp;
+       child_ctx = child->perf_event_ctxp[ctxn];
  
         if (child_ctx && inherited_all) {
                 /*
@@ -5694,63 +6220,98 @@ int perf_event_init_task(struct task_struct *child)
         return ret;
  }
  
+/*
+ * Initialize every perf_event context slot in @child's task_struct
+ */
+int perf_event_init_task(struct task_struct *child)
+{
+       int ctxn, ret;
+
+       for_each_task_context_nr(ctxn) {
+               ret = perf_event_init_context(child, ctxn);
+               if (ret)
+                       return ret;     /* stop at the first nonzero result and pass it on unchanged */
+       }
+
+       return 0;       /* every context number was initialized */
+}
+
  static void __init perf_event_init_all_cpus(void)
  {
+       struct swevent_htable *swhash;
         int cpu;
-       struct perf_cpu_context *cpuctx;
  
         for_each_possible_cpu(cpu) {
-               cpuctx = &per_cpu(perf_cpu_context, cpu);
-               mutex_init(&cpuctx->hlist_mutex);
-               __perf_event_init_context(&cpuctx->ctx, NULL);
+               swhash = &per_cpu(swevent_htable, cpu);
+               mutex_init(&swhash->hlist_mutex);
+               INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
         }
  }
  
  static void __cpuinit perf_event_init_cpu(int cpu)
  {
-       struct perf_cpu_context *cpuctx;
-
-       cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  
-       spin_lock(&perf_resource_lock);
-       cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
-       spin_unlock(&perf_resource_lock);
-
-       mutex_lock(&cpuctx->hlist_mutex);
-       if (cpuctx->hlist_refcount > 0) {
+       mutex_lock(&swhash->hlist_mutex);
+       if (swhash->hlist_refcount > 0) {
                 struct swevent_hlist *hlist;
  
-               hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
-               WARN_ON_ONCE(!hlist);
-               rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+               hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
+               WARN_ON(!hlist);
+               rcu_assign_pointer(swhash->swevent_hlist, hlist);
         }
-       mutex_unlock(&cpuctx->hlist_mutex);
+       mutex_unlock(&swhash->hlist_mutex);
  }
  
  #ifdef CONFIG_HOTPLUG_CPU
-static void __perf_event_exit_cpu(void *info)
+static void perf_pmu_rotate_stop(struct pmu *pmu)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_event_context *ctx = &cpuctx->ctx;
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+       WARN_ON(!irqs_disabled());
+
+       list_del_init(&cpuctx->rotation_list);
+}
+
+static void __perf_event_exit_context(void *__info)
+{      /* Cross-CPU callback: empties the context passed in @__info. */
+       struct perf_event_context *ctx = __info;        /* supplied by perf_event_exit_cpu_context() */
         struct perf_event *event, *tmp;
  
+       perf_pmu_rotate_stop(ctx->pmu);         /* unlinks this CPU's cpuctx for the pmu from its rotation list */
+
         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
                 __perf_event_remove_from_context(event);
         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
                 __perf_event_remove_from_context(event);
  }
+
+static void perf_event_exit_cpu_context(int cpu)
+{      /* For each pmu on the pmus list, empty that pmu's context belonging to @cpu. */
+       struct perf_event_context *ctx;
+       struct pmu *pmu;
+       int idx;        /* value returned by srcu_read_lock(), handed back to srcu_read_unlock() */
+
+       idx = srcu_read_lock(&pmus_srcu);
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;     /* this pmu's context for @cpu */
+
+               mutex_lock(&ctx->mutex);
+               smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);       /* run on @cpu; final argument 1 = wait for it to finish */
+               mutex_unlock(&ctx->mutex);
+       }
+       srcu_read_unlock(&pmus_srcu, idx);
+}
+
  static void perf_event_exit_cpu(int cpu)
  {
-       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-       struct perf_event_context *ctx = &cpuctx->ctx;
+       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  
-       mutex_lock(&cpuctx->hlist_mutex);
-       swevent_hlist_release(cpuctx);
-       mutex_unlock(&cpuctx->hlist_mutex);
+       mutex_lock(&swhash->hlist_mutex);
+       swevent_hlist_release(swhash);
+       mutex_unlock(&swhash->hlist_mutex);
  
-       mutex_lock(&ctx->mutex);
-       smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
-       mutex_unlock(&ctx->mutex);
+       perf_event_exit_cpu_context(cpu);
  }
  #else
  static inline void perf_event_exit_cpu(int cpu) { }
@@ -5780,118 +6341,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
         return NOTIFY_OK;
  }
  
-/*
- * This has to have a higher priority than migration_notifier in sched.c.
- */
-static struct notifier_block __cpuinitdata perf_cpu_nb = {
-       .notifier_call          = perf_cpu_notify,
-       .priority               = 20,
-};
-
  void __init perf_event_init(void)
  {
         perf_event_init_all_cpus();
-       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
-                       (void *)(long)smp_processor_id());
-       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
-                       (void *)(long)smp_processor_id());
-       register_cpu_notifier(&perf_cpu_nb);
-}
-
-static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
-                                       struct sysdev_class_attribute *attr,
-                                       char *buf)
-{
-       return sprintf(buf, "%d\n", perf_reserved_percpu);
-}
-
-static ssize_t
-perf_set_reserve_percpu(struct sysdev_class *class,
-                       struct sysdev_class_attribute *attr,
-                       const char *buf,
-                       size_t count)
-{
-       struct perf_cpu_context *cpuctx;
-       unsigned long val;
-       int err, cpu, mpt;
-
-       err = strict_strtoul(buf, 10, &val);
-       if (err)
-               return err;
-       if (val > perf_max_events)
-               return -EINVAL;
-
-       spin_lock(&perf_resource_lock);
-       perf_reserved_percpu = val;
-       for_each_online_cpu(cpu) {
-               cpuctx = &per_cpu(perf_cpu_context, cpu);
-               raw_spin_lock_irq(&cpuctx->ctx.lock);
-               mpt = min(perf_max_events - cpuctx->ctx.nr_events,
-                         perf_max_events - perf_reserved_percpu);
-               cpuctx->max_pertask = mpt;
-               raw_spin_unlock_irq(&cpuctx->ctx.lock);
-       }
-       spin_unlock(&perf_resource_lock);
-
-       return count;
-}
-
-static ssize_t perf_show_overcommit(struct sysdev_class *class,
-                                   struct sysdev_class_attribute *attr,
-                                   char *buf)
-{
-       return sprintf(buf, "%d\n", perf_overcommit);
-}
-
-static ssize_t
-perf_set_overcommit(struct sysdev_class *class,
-                   struct sysdev_class_attribute *attr,
-                   const char *buf, size_t count)
-{
-       unsigned long val;
-       int err;
-
-       err = strict_strtoul(buf, 10, &val);
-       if (err)
-               return err;
-       if (val > 1)
-               return -EINVAL;
-
-       spin_lock(&perf_resource_lock);
-       perf_overcommit = val;
-       spin_unlock(&perf_resource_lock);
-
-       return count;
-}
-
-static SYSDEV_CLASS_ATTR(
-                               reserve_percpu,
-                               0644,
-                               perf_show_reserve_percpu,
-                               perf_set_reserve_percpu
-                       );
-
-static SYSDEV_CLASS_ATTR(
-                               overcommit,
-                               0644,
-                               perf_show_overcommit,
-                               perf_set_overcommit
-                       );
-
-static struct attribute *perfclass_attrs[] = {
-       &attr_reserve_percpu.attr,
-       &attr_overcommit.attr,
-       NULL
-};
-
-static struct attribute_group perfclass_attr_group = {
-       .attrs                  = perfclass_attrs,
-       .name                   = "perf_events",
-};
-
-static int __init perf_event_sysfs_init(void)
-{
-       return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
-                                 &perfclass_attr_group);
+       init_srcu_struct(&pmus_srcu);
+       perf_pmu_register(&perf_swevent);
+       perf_pmu_register(&perf_cpu_clock);
+       perf_pmu_register(&perf_task_clock);
+       perf_tp_register();
+       perf_cpu_notifier(perf_cpu_notify);
  }
-device_initcall(perf_event_sysfs_init);
diff --git a/kernel/sched.c b/kernel/sched.c

index dc85ceb908322cad7196339f4df8dd58c37b1cec..c0d2067f3e0d81bb45c3372bb39358164612f3a6 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3584,7 +3584,7 @@ void scheduler_tick(void)
         curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
  
-       perf_event_task_tick(curr);
+       perf_event_task_tick();
  
  #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c

index fa7ece649fe1bcad7b0db621fa8579e91860fb04..65fb077ea79c147c21d6a8a4890a2ec6d17712bc 100644 (file)
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -884,10 +884,8 @@ enum {
         FTRACE_ENABLE_CALLS             = (1 << 0),
         FTRACE_DISABLE_CALLS            = (1 << 1),
         FTRACE_UPDATE_TRACE_FUNC        = (1 << 2),
-       FTRACE_ENABLE_MCOUNT            = (1 << 3),
-       FTRACE_DISABLE_MCOUNT           = (1 << 4),
-       FTRACE_START_FUNC_RET           = (1 << 5),
-       FTRACE_STOP_FUNC_RET            = (1 << 6),
+       FTRACE_START_FUNC_RET           = (1 << 3),
+       FTRACE_STOP_FUNC_RET            = (1 << 4),
  };
  
  static int ftrace_filtered;
@@ -1226,8 +1224,6 @@ static void ftrace_shutdown(int command)
  
  static void ftrace_startup_sysctl(void)
  {
-       int command = FTRACE_ENABLE_MCOUNT;
-
         if (unlikely(ftrace_disabled))
                 return;
  
@@ -1235,23 +1231,17 @@ static void ftrace_startup_sysctl(void)
         saved_ftrace_func = NULL;
         /* ftrace_start_up is true if we want ftrace running */
         if (ftrace_start_up)
-               command |= FTRACE_ENABLE_CALLS;
-
-       ftrace_run_update_code(command);
+               ftrace_run_update_code(FTRACE_ENABLE_CALLS);
  }
  
  static void ftrace_shutdown_sysctl(void)
  {
-       int command = FTRACE_DISABLE_MCOUNT;
-
         if (unlikely(ftrace_disabled))
                 return;
  
         /* ftrace_start_up is true if ftrace is running */
         if (ftrace_start_up)
-               command |= FTRACE_DISABLE_CALLS;
-
-       ftrace_run_update_code(command);
+               ftrace_run_update_code(FTRACE_DISABLE_CALLS);
  }
  
  static cycle_t         ftrace_update_time;
@@ -1368,24 +1358,29 @@ enum {
  #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
  
  struct ftrace_iterator {
-       struct ftrace_page      *pg;
-       int                     hidx;
-       int                     idx;
-       unsigned                flags;
-       struct trace_parser     parser;
+       loff_t                          pos;
+       loff_t                          func_pos;
+       struct ftrace_page              *pg;
+       struct dyn_ftrace               *func;
+       struct ftrace_func_probe        *probe;
+       struct trace_parser             parser;
+       int                             hidx;
+       int                             idx;
+       unsigned                        flags;
  };
  
  static void *
-t_hash_next(struct seq_file *m, void *v, loff_t *pos)
+t_hash_next(struct seq_file *m, loff_t *pos)
  {
         struct ftrace_iterator *iter = m->private;
-       struct hlist_node *hnd = v;
+       struct hlist_node *hnd = NULL;
         struct hlist_head *hhd;
  
-       WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
-
         (*pos)++;
+       iter->pos = *pos;
  
+       if (iter->probe)
+               hnd = &iter->probe->node;
   retry:
         if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
                 return NULL;
@@ -1408,7 +1403,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
                 }
         }
  
-       return hnd;
+       if (WARN_ON_ONCE(!hnd))
+               return NULL;
+
+       iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
+
+       return iter;
  }
  
  static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1417,26 +1417,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
         void *p = NULL;
         loff_t l;
  
-       if (!(iter->flags & FTRACE_ITER_HASH))
-               *pos = 0;
-
-       iter->flags |= FTRACE_ITER_HASH;
+       if (iter->func_pos > *pos)
+               return NULL;
  
         iter->hidx = 0;
-       for (l = 0; l <= *pos; ) {
-               p = t_hash_next(m, p, &l);
+       for (l = 0; l <= (*pos - iter->func_pos); ) {
+               p = t_hash_next(m, &l);
                 if (!p)
                         break;
         }
-       return p;
+       if (!p)
+               return NULL;
+
+       /* Only set this if we have an item */
+       iter->flags |= FTRACE_ITER_HASH;
+
+       return iter;
  }
  
-static int t_hash_show(struct seq_file *m, void *v)
+static int
+t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
  {
         struct ftrace_func_probe *rec;
-       struct hlist_node *hnd = v;
  
-       rec = hlist_entry(hnd, struct ftrace_func_probe, node);
+       rec = iter->probe;
+       if (WARN_ON_ONCE(!rec))
+               return -EIO;
  
         if (rec->ops->print)
                 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1457,12 +1463,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
         struct dyn_ftrace *rec = NULL;
  
         if (iter->flags & FTRACE_ITER_HASH)
-               return t_hash_next(m, v, pos);
+               return t_hash_next(m, pos);
  
         (*pos)++;
+       iter->pos = *pos;
  
         if (iter->flags & FTRACE_ITER_PRINTALL)
-               return NULL;
+               return t_hash_start(m, pos);
  
   retry:
         if (iter->idx >= iter->pg->index) {
@@ -1491,7 +1498,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
                 }
         }
  
-       return rec;
+       if (!rec)
+               return t_hash_start(m, pos);
+
+       iter->func_pos = *pos;
+       iter->func = rec;
+
+       return iter;
+}
+
+static void reset_iter_read(struct ftrace_iterator *iter)
+{
+       iter->pos = 0;
+       iter->func_pos = 0;
+       iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
  }
  
  static void *t_start(struct seq_file *m, loff_t *pos)
@@ -1501,6 +1521,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
         loff_t l;
  
         mutex_lock(&ftrace_lock);
+       /*
+        * If an lseek was done, then reset and start from beginning.
+        */
+       if (*pos < iter->pos)
+               reset_iter_read(iter);
+
         /*
          * For set_ftrace_filter reading, if we have the filter
          * off, we can short cut and just print out that all
@@ -1518,6 +1544,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
         if (iter->flags & FTRACE_ITER_HASH)
                 return t_hash_start(m, pos);
  
+       /*
+        * Unfortunately, we need to restart at ftrace_pages_start
+        * every time we let go of the ftrace_lock. This is because
+        * those pointers can change without the lock.
+        */
         iter->pg = ftrace_pages_start;
         iter->idx = 0;
         for (l = 0; l <= *pos; ) {
@@ -1526,10 +1557,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
                         break;
         }
  
-       if (!p && iter->flags & FTRACE_ITER_FILTER)
-               return t_hash_start(m, pos);
+       if (!p) {
+               if (iter->flags & FTRACE_ITER_FILTER)
+                       return t_hash_start(m, pos);
  
-       return p;
+               return NULL;
+       }
+
+       return iter;
  }
  
  static void t_stop(struct seq_file *m, void *p)
@@ -1540,16 +1575,18 @@ static void t_stop(struct seq_file *m, void *p)
  static int t_show(struct seq_file *m, void *v)
  {
         struct ftrace_iterator *iter = m->private;
-       struct dyn_ftrace *rec = v;
+       struct dyn_ftrace *rec;
  
         if (iter->flags & FTRACE_ITER_HASH)
-               return t_hash_show(m, v);
+               return t_hash_show(m, iter);
  
         if (iter->flags & FTRACE_ITER_PRINTALL) {
                 seq_printf(m, "#### all functions enabled ####\n");
                 return 0;
         }
  
+       rec = iter->func;
+
         if (!rec)
                 return 0;
  
@@ -2418,7 +2455,7 @@ static const struct file_operations ftrace_filter_fops = {
         .open = ftrace_filter_open,
         .read = seq_read,
         .write = ftrace_filter_write,
-       .llseek = no_llseek,
+       .llseek = ftrace_regex_lseek,
         .release = ftrace_filter_release,
  };
  
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c

index 492197e2f86cda2792603186b59ad3fdd17c448d..4e2f03410377af2ee6f0fb46a7102e1ae76eba0c 100644 (file)
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2606,6 +2606,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
  }
  EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
  
+/*
+ * The total entries in the ring buffer is the running counter
+ * of entries entered into the ring buffer, minus the sum of
+ * the entries read from the ring buffer and the number of
+ * entries that were overwritten.
+ */
+static inline unsigned long
+rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
+{
+       return local_read(&cpu_buffer->entries) -
+               (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
+}
+
  /**
   * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
   * @buffer: The ring buffer
@@ -2614,16 +2627,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
  unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
  {
         struct ring_buffer_per_cpu *cpu_buffer;
-       unsigned long ret;
  
         if (!cpumask_test_cpu(cpu, buffer->cpumask))
                 return 0;
  
         cpu_buffer = buffer->buffers[cpu];
-       ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
-               - cpu_buffer->read;
  
-       return ret;
+       return rb_num_of_entries(cpu_buffer);
  }
  EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
  
@@ -2684,8 +2694,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
         /* if you care about this being correct, lock the buffer */
         for_each_buffer_cpu(buffer, cpu) {
                 cpu_buffer = buffer->buffers[cpu];
-               entries += (local_read(&cpu_buffer->entries) -
-                           local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
+               entries += rb_num_of_entries(cpu_buffer);
         }
  
         return entries;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c

index 31cc4cb0dbf2afaa49d5f7428f5cce31d535211d..39c059ca670e64156e6681782ffa708c6b8d720f 100644 (file)
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
  #include <linux/kprobes.h>
  #include "trace.h"
  
-static char *perf_trace_buf[4];
+static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
  
  /*
   * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -24,7 +24,7 @@ static int    total_ref_count;
  static int perf_trace_event_init(struct ftrace_event_call *tp_event,
                                  struct perf_event *p_event)
  {
-       struct hlist_head *list;
+       struct hlist_head __percpu *list;
         int ret = -ENOMEM;
         int cpu;
  
@@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
         tp_event->perf_events = list;
  
         if (!total_ref_count) {
-               char *buf;
+               char __percpu *buf;
                 int i;
  
-               for (i = 0; i < 4; i++) {
-                       buf = (char *)alloc_percpu(perf_trace_t);
+               for (i = 0; i < PERF_NR_CONTEXTS; i++) {
+                       buf = (char __percpu *)alloc_percpu(perf_trace_t);
                         if (!buf)
                                 goto fail;
  
@@ -65,7 +65,7 @@ fail:
         if (!total_ref_count) {
                 int i;
  
-               for (i = 0; i < 4; i++) {
+               for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                         free_percpu(perf_trace_buf[i]);
                         perf_trace_buf[i] = NULL;
                 }
@@ -101,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event)
         return ret;
  }
  
-int perf_trace_enable(struct perf_event *p_event)
+int perf_trace_add(struct perf_event *p_event, int flags)
  {
         struct ftrace_event_call *tp_event = p_event->tp_event;
+       struct hlist_head __percpu *pcpu_list;
         struct hlist_head *list;
  
-       list = tp_event->perf_events;
-       if (WARN_ON_ONCE(!list))
+       pcpu_list = tp_event->perf_events;
+       if (WARN_ON_ONCE(!pcpu_list))
                 return -EINVAL;
  
-       list = this_cpu_ptr(list);
+       if (!(flags & PERF_EF_START))
+               p_event->hw.state = PERF_HES_STOPPED;
+
+       list = this_cpu_ptr(pcpu_list);
         hlist_add_head_rcu(&p_event->hlist_entry, list);
  
         return 0;
  }
  
-void perf_trace_disable(struct perf_event *p_event)
+void perf_trace_del(struct perf_event *p_event, int flags)
  {
         hlist_del_rcu(&p_event->hlist_entry);
  }
@@ -142,7 +146,7 @@ void perf_trace_destroy(struct perf_event *p_event)
         tp_event->perf_events = NULL;
  
         if (!--total_ref_count) {
-               for (i = 0; i < 4; i++) {
+               for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                         free_percpu(perf_trace_buf[i]);
                         perf_trace_buf[i] = NULL;
                 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c

index 4c758f146328f18ce82a318fb60a0413006aca8f..398c0e8b332c1840e16bc0599c1230f9594d29a2 100644 (file)
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -600,21 +600,29 @@ out:
  
  enum {
         FORMAT_HEADER           = 1,
-       FORMAT_PRINTFMT         = 2,
+       FORMAT_FIELD_SEPERATOR  = 2,
+       FORMAT_PRINTFMT         = 3,
  };
  
  static void *f_next(struct seq_file *m, void *v, loff_t *pos)
  {
         struct ftrace_event_call *call = m->private;
         struct ftrace_event_field *field;
-       struct list_head *head;
+       struct list_head *common_head = &ftrace_common_fields;
+       struct list_head *head = trace_get_fields(call);
  
         (*pos)++;
  
         switch ((unsigned long)v) {
         case FORMAT_HEADER:
-               head = &ftrace_common_fields;
+               if (unlikely(list_empty(common_head)))
+                       return NULL;
+
+               field = list_entry(common_head->prev,
+                                  struct ftrace_event_field, link);
+               return field;
  
+       case FORMAT_FIELD_SEPERATOR:
                 if (unlikely(list_empty(head)))
                         return NULL;
  
@@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
                 return NULL;
         }
  
-       head = trace_get_fields(call);
-
-       /*
-        * To separate common fields from event fields, the
-        * LSB is set on the first event field. Clear it in case.
-        */
-       v = (void *)((unsigned long)v & ~1L);
-
         field = v;
-       /*
-        * If this is a common field, and at the end of the list, then
-        * continue with main list.
-        */
-       if (field->link.prev == &ftrace_common_fields) {
-               if (unlikely(list_empty(head)))
-                       return NULL;
-               field = list_entry(head->prev, struct ftrace_event_field, link);
-               /* Set the LSB to notify f_show to print an extra newline */
-               field = (struct ftrace_event_field *)
-                       ((unsigned long)field | 1);
-               return field;
-       }
-
-       /* If we are done tell f_show to print the format */
-       if (field->link.prev == head)
+       if (field->link.prev == common_head)
+               return (void *)FORMAT_FIELD_SEPERATOR;
+       else if (field->link.prev == head)
                 return (void *)FORMAT_PRINTFMT;
  
         field = list_entry(field->link.prev, struct ftrace_event_field, link);
@@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v)
                 seq_printf(m, "format:\n");
                 return 0;
  
+       case FORMAT_FIELD_SEPERATOR:
+               seq_putc(m, '\n');
+               return 0;
+
         case FORMAT_PRINTFMT:
                 seq_printf(m, "\nprint fmt: %s\n",
                            call->print_fmt);
                 return 0;
         }
  
-       /*
-        * To separate common fields from event fields, the
-        * LSB is set on the first event field. Clear it and
-        * print a newline if it is set.
-        */
-       if ((unsigned long)v & 1) {
-               seq_putc(m, '\n');
-               v = (void *)((unsigned long)v & ~1L);
-       }
-
         field = v;
  
         /*
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c

index 6f233698518ede15cc9302e889de9f108aa0f1cb..02c708ae0d420180d1125580820d9ff85cb3d94d 100644 (file)
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,15 +15,19 @@
  #include "trace.h"
  #include "trace_output.h"
  
+/* When set, irq functions will be ignored */
+static int ftrace_graph_skip_irqs;
+
  struct fgraph_cpu_data {
         pid_t           last_pid;
         int             depth;
+       int             depth_irq;
         int             ignore;
         unsigned long   enter_funcs[FTRACE_RETFUNC_DEPTH];
  };
  
  struct fgraph_data {
-       struct fgraph_cpu_data          *cpu_data;
+       struct fgraph_cpu_data __percpu *cpu_data;
  
         /* Place to preserve last processed entry. */
         struct ftrace_graph_ent_entry   ent;
@@ -41,6 +45,7 @@ struct fgraph_data {
  #define TRACE_GRAPH_PRINT_PROC         0x8
  #define TRACE_GRAPH_PRINT_DURATION     0x10
  #define TRACE_GRAPH_PRINT_ABS_TIME     0x20
+#define TRACE_GRAPH_PRINT_IRQS         0x40
  
  static struct tracer_opt trace_opts[] = {
         /* Display overruns? (for self-debug purpose) */
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = {
         { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
         /* Display absolute time of an entry */
         { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
+       /* Display interrupts */
+       { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
         { } /* Empty entry */
  };
  
  static struct tracer_flags tracer_flags = {
         /* Don't display overruns and proc by default */
         .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
-              TRACE_GRAPH_PRINT_DURATION,
+              TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
         .opts = trace_opts
  };
  
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
         return 1;
  }
  
+static inline int ftrace_graph_ignore_irqs(void)
+{
+       if (!ftrace_graph_skip_irqs)
+               return 0;
+
+       return in_irq();
+}
+
  int trace_graph_entry(struct ftrace_graph_ent *trace)
  {
         struct trace_array *tr = graph_array;
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
                 return 0;
  
         /* trace it when it is-nested-in or is a function enabled. */
-       if (!(trace->depth || ftrace_graph_addr(trace->func)))
+       if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
+             ftrace_graph_ignore_irqs())
                 return 0;
  
         local_irq_save(flags);
@@ -855,6 +871,92 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
         return 0;
  }
  
+/*
+ * Entry check for irq code
+ *
+ * returns 1 if
+ *  - we are inside irq code
+ *  - we just entered irq code
+ *
+ * returns 0 if
+ *  - funcgraph-irqs option is set
+ *  - there is no iterator private data to track the irq depth in
+ *  - we are not inside irq code
+ */
+static int
+check_irq_entry(struct trace_iterator *iter, u32 flags,
+               unsigned long addr, int depth)
+{
+       int cpu = iter->cpu;
+       struct fgraph_data *data = iter->private;
+       int *depth_irq;
+
+       /* iter->private may be NULL; never touch data->cpu_data then. */
+       if ((flags & TRACE_GRAPH_PRINT_IRQS) || !data)
+               return 0;
+
+       depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+       /* We are already inside the irq code. */
+       if (*depth_irq >= 0)
+               return 1;
+
+       if ((addr < (unsigned long)__irqentry_text_start) ||
+           (addr >= (unsigned long)__irqentry_text_end))
+               return 0;
+
+       /* We are entering irq code: remember the depth we entered at. */
+       *depth_irq = depth;
+       return 1;
+}
+
+/*
+ * Return check for irq code
+ *
+ * returns 1 if
+ *  - we are inside irq code
+ *  - we just left irq code
+ *
+ * returns 0 if
+ *  - funcgraph-irqs option is set
+ *  - there is no iterator private data to track the irq depth in
+ *  - we are not inside irq code
+ */
+static int
+check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
+{
+       int cpu = iter->cpu;
+       struct fgraph_data *data = iter->private;
+       int *depth_irq;
+
+       /* iter->private may be NULL; never touch data->cpu_data then. */
+       if ((flags & TRACE_GRAPH_PRINT_IRQS) || !data)
+               return 0;
+
+       depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+       /* We are not inside the irq code. */
+       if (*depth_irq == -1)
+               return 0;
+
+       /*
+        * We are inside the irq code, and this is the returning entry.
+        * Let's not trace it and clear the entry depth, since
+        * we are out of irq code.
+        *
+        * This condition ensures that we 'leave the irq code' once
+        * we are out of the entry depth. Thus protecting us from
+        * the RETURN entry loss.
+        */
+       if (*depth_irq >= depth) {
+               *depth_irq = -1;
+               return 1;
+       }
+
+       /* We are inside the irq code, and this is not the entry. */
+       return 1;
+}
+
  static enum print_line_t
  print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
                         struct trace_iterator *iter, u32 flags)
@@ -865,6 +967,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
         static enum print_line_t ret;
         int cpu = iter->cpu;
  
+       if (check_irq_entry(iter, flags, call->func, call->depth))
+               return TRACE_TYPE_HANDLED;
+
         if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
                 return TRACE_TYPE_PARTIAL_LINE;
  
@@ -902,6 +1007,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
         int ret;
         int i;
  
+       if (check_irq_return(iter, flags, trace->depth))
+               return TRACE_TYPE_HANDLED;
+
         if (data) {
                 struct fgraph_cpu_data *cpu_data;
                 int cpu = iter->cpu;
@@ -1210,9 +1318,12 @@ void graph_trace_open(struct trace_iterator *iter)
                 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
                 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
                 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
+               int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
                 *pid = -1;
                 *depth = 0;
                 *ignore = 0;
+               *depth_irq = -1;
         }
  
         iter->private = data;
@@ -1235,6 +1346,14 @@ void graph_trace_close(struct trace_iterator *iter)
         }
  }
  
+static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
+{
+       if (bit == TRACE_GRAPH_PRINT_IRQS)
+               ftrace_graph_skip_irqs = !set;
+
+       return 0;
+}
+
  static struct trace_event_functions graph_functions = {
         .trace          = print_graph_function_event,
  };
@@ -1261,6 +1380,7 @@ static struct tracer graph_trace __read_mostly = {
         .print_line     = print_graph_function,
         .print_header   = print_graph_headers,
         .flags          = &tracer_flags,
+       .set_flag       = func_graph_set_flag,
  #ifdef CONFIG_FTRACE_SELFTEST
         .selftest       = trace_selftest_startup_function_graph,
  #endif
diff --git a/kernel/watchdog.c b/kernel/watchdog.c

index 7f9c3c52ecc12ef5d0de1728839218ff87c2dc7b..dc8e16824b51b18c8c113f0d670d70cb58ede740 100644 (file)
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
  static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
  #endif
  
-static int __read_mostly did_panic;
  static int __initdata no_watchdog;
  
  
@@ -187,18 +186,6 @@ static int is_softlockup(unsigned long touch_ts)
         return 0;
  }
  
-static int
-watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
-{
-       did_panic = 1;
-
-       return NOTIFY_DONE;
-}
-
-static struct notifier_block panic_block = {
-       .notifier_call = watchdog_panic,
-};
-
  #ifdef CONFIG_HARDLOCKUP_DETECTOR
  static struct perf_event_attr wd_hw_attr = {
         .type           = PERF_TYPE_HARDWARE,
@@ -371,14 +358,14 @@ static int watchdog_nmi_enable(int cpu)
         /* Try to register using hardware perf events */
         wd_attr = &wd_hw_attr;
         wd_attr->sample_period = hw_nmi_get_sample_period();
-       event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
+       event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
         if (!IS_ERR(event)) {
                 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
                 goto out_save;
         }
  
         printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
-       return -1;
+       return PTR_ERR(event);
  
         /* success path */
  out_save:
@@ -422,17 +409,19 @@ static int watchdog_prepare_cpu(int cpu)
  static int watchdog_enable(int cpu)
  {
         struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
+       int err;
  
         /* enable the perf event */
-       if (watchdog_nmi_enable(cpu) != 0)
-               return -1;
+       err = watchdog_nmi_enable(cpu);
+       if (err)
+               return err;
  
         /* create the watchdog thread */
         if (!p) {
                 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
                 if (IS_ERR(p)) {
                         printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
-                       return -1;
+                       return PTR_ERR(p);
                 }
                 kthread_bind(p, cpu);
                 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -484,6 +473,9 @@ static void watchdog_disable_all_cpus(void)
  {
         int cpu;
  
+       if (no_watchdog)
+               return;
+
         for_each_online_cpu(cpu)
                 watchdog_disable(cpu);
  
@@ -526,17 +518,16 @@ static int __cpuinit
  cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
  {
         int hotcpu = (unsigned long)hcpu;
+       int err = 0;
  
         switch (action) {
         case CPU_UP_PREPARE:
         case CPU_UP_PREPARE_FROZEN:
-               if (watchdog_prepare_cpu(hotcpu))
-                       return NOTIFY_BAD;
+               err = watchdog_prepare_cpu(hotcpu);
                 break;
         case CPU_ONLINE:
         case CPU_ONLINE_FROZEN:
-               if (watchdog_enable(hotcpu))
-                       return NOTIFY_BAD;
+               err = watchdog_enable(hotcpu);
                 break;
  #ifdef CONFIG_HOTPLUG_CPU
         case CPU_UP_CANCELED:
@@ -549,7 +540,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 break;
  #endif /* CONFIG_HOTPLUG_CPU */
         }
-       return NOTIFY_OK;
+       return notifier_from_errno(err);
  }
  
  static struct notifier_block __cpuinitdata cpu_nfb = {
@@ -565,13 +556,11 @@ static int __init spawn_watchdog_task(void)
                 return 0;
  
         err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
-       WARN_ON(err == NOTIFY_BAD);
+       WARN_ON(notifier_to_errno(err));
  
         cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
         register_cpu_notifier(&cpu_nfb);
  
-       atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
-
         return 0;
  }
  early_initcall(spawn_watchdog_task);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug

index 1b4afd2e6ca089de0babdacc5781426ef118da5c..e85d549b6eac20eb040ba4b27d6f4d334e0b5106 100644 (file)
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -482,6 +482,7 @@ config PROVE_LOCKING
         select DEBUG_SPINLOCK
         select DEBUG_MUTEXES
         select DEBUG_LOCK_ALLOC
+       select TRACE_IRQFLAGS
         default n
         help
          This feature enables the kernel to prove that all locking
@@ -579,11 +580,10 @@ config DEBUG_LOCKDEP
           of more runtime overhead.
  
  config TRACE_IRQFLAGS
-       depends on DEBUG_KERNEL
         bool
-       default y
-       depends on TRACE_IRQFLAGS_SUPPORT
-       depends on PROVE_LOCKING
+       help
+         Enables hooks to interrupt enabling and disabling for
+         either tracing or lock debugging.
  
  config DEBUG_SPINLOCK_SLEEP
         bool "Spinlock debugging: sleep-inside-spinlock checking"
diff --git a/net/core/datagram.c b/net/core/datagram.c

index 251997a9548362c5ebc8a0a34842971a0198539a..282806ba7a57e60991f2f7806bc3015d9b8596a5 100644 (file)
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -243,6 +243,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
         unlock_sock_fast(sk, slow);
  
         /* skb is now orphaned, can be freed outside of locked section */
+       trace_kfree_skb(skb, skb_free_datagram_locked);
         __kfree_skb(skb);
  }
  EXPORT_SYMBOL(skb_free_datagram_locked);
diff --git a/net/core/dev.c b/net/core/dev.c

index 660dd41aaaa6629c0d59547e04d23353ba935af8..7ec85e27beed840b8da5f9dae620bcd48453f3f7 100644 (file)
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -128,6 +128,8 @@
  #include <linux/jhash.h>
  #include <linux/random.h>
  #include <trace/events/napi.h>
+#include <trace/events/net.h>
+#include <trace/events/skb.h>
  #include <linux/pci.h>
  
  #include "net-sysfs.h"
@@ -1978,6 +1980,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                 }
  
                 rc = ops->ndo_start_xmit(skb, dev);
+               trace_net_dev_xmit(skb, rc);
                 if (rc == NETDEV_TX_OK)
                         txq_trans_update(txq);
                 return rc;
@@ -1998,6 +2001,7 @@ gso:
                         skb_dst_drop(nskb);
  
                 rc = ops->ndo_start_xmit(nskb, dev);
+               trace_net_dev_xmit(nskb, rc);
                 if (unlikely(rc != NETDEV_TX_OK)) {
                         if (rc & ~NETDEV_TX_MASK)
                                 goto out_kfree_gso_skb;
@@ -2186,6 +2190,7 @@ int dev_queue_xmit(struct sk_buff *skb)
  #ifdef CONFIG_NET_CLS_ACT
         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
  #endif
+       trace_net_dev_queue(skb);
         if (q->enqueue) {
                 rc = __dev_xmit_skb(skb, q, dev, txq);
                 goto out;
@@ -2512,6 +2517,7 @@ int netif_rx(struct sk_buff *skb)
         if (netdev_tstamp_prequeue)
                 net_timestamp_check(skb);
  
+       trace_netif_rx(skb);
  #ifdef CONFIG_RPS
         {
                 struct rps_dev_flow voidflow, *rflow = &voidflow;
@@ -2571,6 +2577,7 @@ static void net_tx_action(struct softirq_action *h)
                         clist = clist->next;
  
                         WARN_ON(atomic_read(&skb->users));
+                       trace_kfree_skb(skb, net_tx_action);
                         __kfree_skb(skb);
                 }
         }
@@ -2828,6 +2835,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
         if (!netdev_tstamp_prequeue)
                 net_timestamp_check(skb);
  
+       trace_netif_receive_skb(skb);
         if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
                 return NET_RX_SUCCESS;
  
diff --git a/net/core/net-traces.c b/net/core/net-traces.c

index afa6380ed88ac2ee0b5cd8c8a731dcb9f3dcb809..7f1bb2aba03bf0e501ee1625ba6f07163b528ab9 100644 (file)
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -26,6 +26,7 @@
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/skb.h>
+#include <trace/events/net.h>
  #include <trace/events/napi.h>
  
  EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c

index c83b421341c01b4a1de702e42d3380fac2f0aa2b..56ba3c4e4761c6584375a8acf45022bc5c40d6e8 100644 (file)
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -466,6 +466,7 @@ void consume_skb(struct sk_buff *skb)
                 smp_rmb();
         else if (likely(!atomic_dec_and_test(&skb->users)))
                 return;
+       trace_consume_skb(skb);
         __kfree_skb(skb);
  }
  EXPORT_SYMBOL(consume_skb);
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt

index 5164a655c39f60b578c8642f2caabe8af66fa016..b2c63309a65165b471822e99268c828bbdb07777 100644 (file)
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -8,7 +8,7 @@ perf-annotate - Read perf.data (created by perf record) and display annotated co
  SYNOPSIS
  --------
  [verse]
-'perf annotate' [-i <file> | --input=file] symbol_name
+'perf annotate' [-i <file> | --input=file] [symbol_name]
  
  DESCRIPTION
  -----------
@@ -24,6 +24,13 @@ OPTIONS
  --input=::
          Input file name. (default: perf.data)
  
+--stdio:: Use the stdio interface.
+
+--tui:: Use the TUI interface. Use of --tui requires a tty; if one is not
+       present, as when piping to other commands, the stdio interface is
+       used. This interface starts by centering on the line with more
+       samples; TAB/UNTAB cycles through the lines with more samples.
+
  SEE ALSO
  --------
-linkperf:perf-record[1]
+linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt

index abfabe9147a4f2a48b6fd47bfdb758de2a3f3eea..12052c9ed0babfc3a1c93cc01758ec3b7747ee10 100644 (file)
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -65,6 +65,13 @@ OPTIONS
                  the tree is considered as a new profiled object. +
         Default: fractal,0.5.
  
+--stdio:: Use the stdio interface.
+
+--tui:: Use the TUI interface, which is integrated with annotate and allows
+        zooming into DSOs or threads, among other features. Use of --tui
+       requires a tty; if one is not present, as when piping to other
+       commands, the stdio interface is used.
+
  SEE ALSO
  --------
  linkperf:perf-stat[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile

index 4f1fa77c1feb0b7a854ab8a85bd21682cbc66377..fe1e30722f3b2a9c8d0d2bc1501b0be693379aba 100644 (file)
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -313,6 +313,9 @@ TEST_PROGRAMS =
  
  SCRIPT_SH += perf-archive.sh
  
+grep-libs = $(filter -l%,$(1))
+strip-libs = $(filter-out -l%,$(1))
+
  #
  # No Perl scripts right now:
  #
@@ -588,14 +591,17 @@ endif
  ifdef NO_LIBPERL
         BASIC_CFLAGS += -DNO_LIBPERL
  else
-       PERL_EMBED_LDOPTS = `perl -MExtUtils::Embed -e ldopts 2>/dev/null`
+       PERL_EMBED_LDOPTS = $(shell perl -MExtUtils::Embed -e ldopts 2>/dev/null)
+       PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS))
+       PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
         PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
         FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
  
         ifneq ($(call try-cc,$(SOURCE_PERL_EMBED),$(FLAGS_PERL_EMBED)),y)
                 BASIC_CFLAGS += -DNO_LIBPERL
         else
-               ALL_LDFLAGS += $(PERL_EMBED_LDOPTS)
+               ALL_LDFLAGS += $(PERL_EMBED_LDFLAGS)
+               EXTLIBS += $(PERL_EMBED_LIBADD)
                 LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-perl.o
                 LIB_OBJS += $(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o
         endif
@@ -604,13 +610,16 @@ endif
  ifdef NO_LIBPYTHON
         BASIC_CFLAGS += -DNO_LIBPYTHON
  else
-       PYTHON_EMBED_LDOPTS = `python-config --ldflags 2>/dev/null`
+       PYTHON_EMBED_LDOPTS = $(shell python-config --ldflags 2>/dev/null)
+       PYTHON_EMBED_LDFLAGS = $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
+       PYTHON_EMBED_LIBADD = $(call grep-libs,$(PYTHON_EMBED_LDOPTS))
         PYTHON_EMBED_CCOPTS = `python-config --cflags 2>/dev/null`
         FLAGS_PYTHON_EMBED=$(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
         ifneq ($(call try-cc,$(SOURCE_PYTHON_EMBED),$(FLAGS_PYTHON_EMBED)),y)
                 BASIC_CFLAGS += -DNO_LIBPYTHON
         else
-               ALL_LDFLAGS += $(PYTHON_EMBED_LDOPTS)
+               ALL_LDFLAGS += $(PYTHON_EMBED_LDFLAGS)
+               EXTLIBS += $(PYTHON_EMBED_LIBADD)
                 LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o
                 LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o
         endif
@@ -653,6 +662,15 @@ else
         endif
  endif
  
+
+ifdef NO_STRLCPY
+       BASIC_CFLAGS += -DNO_STRLCPY
+else
+       ifneq ($(call try-cc,$(SOURCE_STRLCPY),),y)
+               BASIC_CFLAGS += -DNO_STRLCPY
+       endif
+endif
+
  ifndef CC_LD_DYNPATH
         ifdef NO_R_TO_GCC_LINKER
                 # Some gcc does not accept and pass -R to the linker to specify
@@ -910,8 +928,8 @@ $(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
                 $(ALL_CFLAGS) -c $(filter %.c,$^) -o $@
  
  $(OUTPUT)perf$X: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS)
-       $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(OUTPUT)perf.o \
-               $(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
+       $(QUIET_LINK)$(CC) $(ALL_CFLAGS) $(ALL_LDFLAGS) $(OUTPUT)perf.o \
+               $(BUILTIN_OBJS) $(LIBS) -o $@
  
  $(OUTPUT)builtin-help.o: builtin-help.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
         $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) \
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c

index 1478dc64bf157fc0f226cba51eef80ec8646c15f..6d5604d8df9599acb55d87017f5d58e19d906395 100644 (file)
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -28,7 +28,7 @@
  
  static char            const *input_name = "perf.data";
  
-static bool            force;
+static bool            force, use_tui, use_stdio;
  
  static bool            full_paths;
  
@@ -321,7 +321,7 @@ static int hist_entry__tty_annotate(struct hist_entry *he)
  
  static void hists__find_annotations(struct hists *self)
  {
-       struct rb_node *first = rb_first(&self->entries), *nd = first;
+       struct rb_node *nd = rb_first(&self->entries), *next;
         int key = KEY_RIGHT;
  
         while (nd) {
@@ -343,20 +343,19 @@ find_next:
  
                 if (use_browser > 0) {
                         key = hist_entry__tui_annotate(he);
-                       if (is_exit_key(key))
-                               break;
                         switch (key) {
                         case KEY_RIGHT:
-                       case '\t':
-                               nd = rb_next(nd);
+                               next = rb_next(nd);
                                 break;
                         case KEY_LEFT:
-                               if (nd == first)
-                                       continue;
-                               nd = rb_prev(nd);
-                       default:
+                               next = rb_prev(nd);
                                 break;
+                       default:
+                               return;
                         }
+
+                       if (next != NULL)
+                               nd = next;
                 } else {
                         hist_entry__tty_annotate(he);
                         nd = rb_next(nd);
@@ -428,6 +427,8 @@ static const struct option options[] = {
                     "be more verbose (show symbol address, etc)"),
         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
                     "dump raw trace in ASCII"),
+       OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"),
+       OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
         OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
                    "file", "vmlinux pathname"),
         OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
@@ -443,6 +444,11 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used)
  {
         argc = parse_options(argc, argv, options, annotate_usage, 0);
  
+       if (use_stdio)
+               use_browser = 0;
+       else if (use_tui)
+               use_browser = 1;
+
         setup_browser();
  
         symbol_conf.priv_size = sizeof(struct sym_priv);
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c

index 55fc1f46892a6a920411db7dc91226bcbe6a7f82..5de405d452300318541338293563d8ebc41ccb87 100644 (file)
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -32,7 +32,7 @@
  
  static char            const *input_name = "perf.data";
  
-static bool            force;
+static bool            force, use_tui, use_stdio;
  static bool            hide_unresolved;
  static bool            dont_use_callchains;
  
@@ -107,7 +107,8 @@ static int perf_session__add_hist_entry(struct perf_session *self,
                 goto out_free_syms;
         err = 0;
         if (symbol_conf.use_callchain) {
-               err = append_chain(he->callchain, data->callchain, syms, data->period);
+               err = callchain_append(he->callchain, data->callchain, syms,
+                                      data->period);
                 if (err)
                         goto out_free_syms;
         }
@@ -450,6 +451,8 @@ static const struct option options[] = {
                     "Show per-thread event counters"),
         OPT_STRING(0, "pretty", &pretty_printing_style, "key",
                    "pretty printing style key: normal raw"),
+       OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"),
+       OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
         OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
                    "sort by key(s): pid, comm, dso, symbol, parent"),
         OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization,
@@ -482,8 +485,15 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
  {
         argc = parse_options(argc, argv, options, report_usage, 0);
  
+       if (use_stdio)
+               use_browser = 0;
+       else if (use_tui)
+               use_browser = 1;
+
         if (strcmp(input_name, "-") != 0)
                 setup_browser();
+       else
+               use_browser = 0;
         /*
          * Only in the newt browser we are doing integrated annotation,
          * so don't allocate extra space that won't be used in the stdio
diff --git a/tools/perf/feature-tests.mak b/tools/perf/feature-tests.mak

index 7a7b6085905382c791834b0c0f35ed5f39658e26..b253db634f04b7e8ddfddd1cc33bb3ce8343a49a 100644 (file)
--- a/tools/perf/feature-tests.mak
+++ b/tools/perf/feature-tests.mak
@@ -110,6 +110,17 @@ int main(void)
  }
  endef
  
+define SOURCE_STRLCPY
+#include <stdlib.h>
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+int main(void)
+{
+       strlcpy(NULL, NULL, 0);
+       return 0;
+}
+endef
+
  # try-cc
  # Usage: option = $(call try-cc, source-to-build, cc-options)
  try-cc = $(shell sh -c                                           \
diff --git a/tools/perf/scripts/python/bin/netdev-times-record b/tools/perf/scripts/python/bin/netdev-times-record

new file mode 100644 (file)

index 0000000..d931a82
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-record
@@ -0,0 +1,8 @@
+#!/bin/bash
+perf record -a -e net:net_dev_xmit -e net:net_dev_queue                \
+               -e net:netif_receive_skb -e net:netif_rx                \
+               -e skb:consume_skb -e skb:kfree_skb                     \
+               -e skb:skb_copy_datagram_iovec -e napi:napi_poll        \
+               -e irq:irq_handler_entry -e irq:irq_handler_exit        \
+               -e irq:softirq_entry -e irq:softirq_exit                \
+               -e irq:softirq_raise $@
diff --git a/tools/perf/scripts/python/bin/netdev-times-report b/tools/perf/scripts/python/bin/netdev-times-report

new file mode 100644 (file)

index 0000000..c3d0a63
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-report
@@ -0,0 +1,5 @@
+#!/bin/bash
+# description: display a process of packet and processing time
+# args: [tx] [rx] [dev=] [debug]
+
+perf trace -s ~/libexec/perf-core/scripts/python/netdev-times.py $@
diff --git a/tools/perf/scripts/python/netdev-times.py b/tools/perf/scripts/python/netdev-times.py

new file mode 100644 (file)

index 0000000..9aa0a32
--- /dev/null
+++ b/tools/perf/scripts/python/netdev-times.py
@@ -0,0 +1,464 @@
+# Display a process of packets and processed time.
+# It helps us to investigate networking or network device.
+#
+# options
+# tx: show only tx chart
+# rx: show only rx chart
+# dev=: show only things related to the specified device
+# debug: work with debug mode. It shows buffer status.
+
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+       '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+from Util import *
+
+all_event_list = []; # insert all tracepoint event related with this script
+irq_dic = {}; # key is cpu and value is a list which stacks irqs
+              # which raise NET_RX softirq
+net_rx_dic = {}; # key is cpu and value include time of NET_RX softirq-entry
+                # and a list which stacks receive
+receive_hunk_list = []; # a list which include a sequence of receive events
+rx_skb_list = []; # received packet list for matching
+                      # skb_copy_datagram_iovec
+
+buffer_budget = 65536; # the budget of rx_skb_list, tx_queue_list and
+                      # tx_xmit_list
+of_count_rx_skb_list = 0; # overflow count
+
+tx_queue_list = []; # list of packets which pass through dev_queue_xmit
+of_count_tx_queue_list = 0; # overflow count
+
+tx_xmit_list = [];  # list of packets which pass through dev_hard_start_xmit
+of_count_tx_xmit_list = 0; # overflow count
+
+tx_free_list = [];  # list of packets which is freed
+
+# options
+show_tx = 0;
+show_rx = 0;
+dev = 0; # store a name of device specified by option "dev="
+debug = 0;
+
+# indices of event_info tuple
+EINFO_IDX_NAME=   0
+EINFO_IDX_CONTEXT=1
+EINFO_IDX_CPU=    2
+EINFO_IDX_TIME=   3
+EINFO_IDX_PID=    4
+EINFO_IDX_COMM=   5
+
+# Calculate a time interval(msec) from src(nsec) to dst(nsec)
+def diff_msec(src, dst):
+       return (dst - src) / 1000000.0
+
+# Display a process of transmitting a packet
+def print_transmit(hunk):
+       if dev != 0 and hunk['dev'].find(dev) < 0:
+               return
+       print "%7s %5d %6d.%06dsec %12.3fmsec      %12.3fmsec" % \
+               (hunk['dev'], hunk['len'],
+               nsecs_secs(hunk['queue_t']),
+               nsecs_nsecs(hunk['queue_t'])/1000,
+               diff_msec(hunk['queue_t'], hunk['xmit_t']),
+               diff_msec(hunk['xmit_t'], hunk['free_t']))
+
+# Format for displaying rx packet processing
+PF_IRQ_ENTRY= "  irq_entry(+%.3fmsec irq=%d:%s)"
+PF_SOFT_ENTRY="  softirq_entry(+%.3fmsec)"
+PF_NAPI_POLL= "  napi_poll_exit(+%.3fmsec %s)"
+PF_JOINT=     "         |"
+PF_WJOINT=    "         |            |"
+PF_NET_RECV=  "         |---netif_receive_skb(+%.3fmsec skb=%x len=%d)"
+PF_NET_RX=    "         |---netif_rx(+%.3fmsec skb=%x)"
+PF_CPY_DGRAM= "         |      skb_copy_datagram_iovec(+%.3fmsec %d:%s)"
+PF_KFREE_SKB= "         |      kfree_skb(+%.3fmsec location=%x)"
+PF_CONS_SKB=  "         |      consume_skb(+%.3fmsec)"
+
+# Display a process of received packets and interrupts associated with
+# a NET_RX softirq
+def print_receive(hunk):
+       show_hunk = 0
+       irq_list = hunk['irq_list']
+       cpu = irq_list[0]['cpu']
+       base_t = irq_list[0]['irq_ent_t']
+       # check if this hunk should be shown
+       if dev != 0:
+               for i in range(len(irq_list)):
+                       if irq_list[i]['name'].find(dev) >= 0:
+                               show_hunk = 1
+                               break
+       else:
+               show_hunk = 1
+       if show_hunk == 0:
+               return
+
+       print "%d.%06dsec cpu=%d" % \
+               (nsecs_secs(base_t), nsecs_nsecs(base_t)/1000, cpu)
+       for i in range(len(irq_list)):
+               print PF_IRQ_ENTRY % \
+                       (diff_msec(base_t, irq_list[i]['irq_ent_t']),
+                       irq_list[i]['irq'], irq_list[i]['name'])
+               print PF_JOINT
+               irq_event_list = irq_list[i]['event_list']
+               for j in range(len(irq_event_list)):
+                       irq_event = irq_event_list[j]
+                       if irq_event['event'] == 'netif_rx':
+                               print PF_NET_RX % \
+                                       (diff_msec(base_t, irq_event['time']),
+                                       irq_event['skbaddr'])
+                               print PF_JOINT
+       print PF_SOFT_ENTRY % \
+               diff_msec(base_t, hunk['sirq_ent_t'])
+       print PF_JOINT
+       event_list = hunk['event_list']
+       for i in range(len(event_list)):
+               event = event_list[i]
+               if event['event_name'] == 'napi_poll':
+                       print PF_NAPI_POLL % \
+                           (diff_msec(base_t, event['event_t']), event['dev'])
+                       if i == len(event_list) - 1:
+                               print ""
+                       else:
+                               print PF_JOINT
+               else:
+                       print PF_NET_RECV % \
+                           (diff_msec(base_t, event['event_t']), event['skbaddr'],
+                               event['len'])
+                       if 'comm' in event.keys():
+                               print PF_WJOINT
+                               print PF_CPY_DGRAM % \
+                                       (diff_msec(base_t, event['comm_t']),
+                                       event['pid'], event['comm'])
+                       elif 'handle' in event.keys():
+                               print PF_WJOINT
+                               if event['handle'] == "kfree_skb":
+                                       print PF_KFREE_SKB % \
+                                               (diff_msec(base_t,
+                                               event['comm_t']),
+                                               event['location'])
+                               elif event['handle'] == "consume_skb":
+                                       print PF_CONS_SKB % \
+                                               diff_msec(base_t,
+                                                       event['comm_t'])
+                       print PF_JOINT
+
+def trace_begin():
+       global show_tx
+       global show_rx
+       global dev
+       global debug
+
+       for i in range(len(sys.argv)):
+               if i == 0:
+                       continue
+               arg = sys.argv[i]
+               if arg == 'tx':
+                       show_tx = 1
+               elif arg =='rx':
+                       show_rx = 1
+               elif arg.find('dev=',0, 4) >= 0:
+                       dev = arg[4:]
+               elif arg == 'debug':
+                       debug = 1
+       if show_tx == 0  and show_rx == 0:
+               show_tx = 1
+               show_rx = 1
+
+def trace_end():
+       # order all events in time
+       all_event_list.sort(lambda a,b :cmp(a[EINFO_IDX_TIME],
+                                           b[EINFO_IDX_TIME]))
+       # process all events
+       for i in range(len(all_event_list)):
+               event_info = all_event_list[i]
+               name = event_info[EINFO_IDX_NAME]
+               if name == 'irq__softirq_exit':
+                       handle_irq_softirq_exit(event_info)
+               elif name == 'irq__softirq_entry':
+                       handle_irq_softirq_entry(event_info)
+               elif name == 'irq__softirq_raise':
+                       handle_irq_softirq_raise(event_info)
+               elif name == 'irq__irq_handler_entry':
+                       handle_irq_handler_entry(event_info)
+               elif name == 'irq__irq_handler_exit':
+                       handle_irq_handler_exit(event_info)
+               elif name == 'napi__napi_poll':
+                       handle_napi_poll(event_info)
+               elif name == 'net__netif_receive_skb':
+                       handle_netif_receive_skb(event_info)
+               elif name == 'net__netif_rx':
+                       handle_netif_rx(event_info)
+               elif name == 'skb__skb_copy_datagram_iovec':
+                       handle_skb_copy_datagram_iovec(event_info)
+               elif name == 'net__net_dev_queue':
+                       handle_net_dev_queue(event_info)
+               elif name == 'net__net_dev_xmit':
+                       handle_net_dev_xmit(event_info)
+               elif name == 'skb__kfree_skb':
+                       handle_kfree_skb(event_info)
+               elif name == 'skb__consume_skb':
+                       handle_consume_skb(event_info)
+       # display receive hunks
+       if show_rx:
+               for i in range(len(receive_hunk_list)):
+                       print_receive(receive_hunk_list[i])
+       # display transmit hunks
+       if show_tx:
+               print "   dev    len      Qdisc        " \
+                       "       netdevice             free"
+               for i in range(len(tx_free_list)):
+                       print_transmit(tx_free_list[i])
+       if debug:
+               print "debug buffer status"
+               print "----------------------------"
+               print "xmit Qdisc:remain:%d overflow:%d" % \
+                       (len(tx_queue_list), of_count_tx_queue_list)
+               print "xmit netdevice:remain:%d overflow:%d" % \
+                       (len(tx_xmit_list), of_count_tx_xmit_list)
+               print "receive:remain:%d overflow:%d" % \
+                       (len(rx_skb_list), of_count_rx_skb_list)
+
+# called from perf, when it finds a corresponding event
+def irq__softirq_entry(name, context, cpu, sec, nsec, pid, comm, vec):
+       if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+               return
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec)
+       all_event_list.append(event_info)
+
+def irq__softirq_exit(name, context, cpu, sec, nsec, pid, comm, vec):
+       if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+               return
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec)
+       all_event_list.append(event_info)
+
+def irq__softirq_raise(name, context, cpu, sec, nsec, pid, comm, vec):
+       if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+               return
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec)
+       all_event_list.append(event_info)
+
+def irq__irq_handler_entry(name, context, cpu, sec, nsec, pid, comm,
+                       irq, irq_name):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       irq, irq_name)
+       all_event_list.append(event_info)
+
+def irq__irq_handler_exit(name, context, cpu, sec, nsec, pid, comm, irq, ret):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, irq, ret)
+       all_event_list.append(event_info)
+
+def napi__napi_poll(name, context, cpu, sec, nsec, pid, comm, napi, dev_name):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       napi, dev_name)
+       all_event_list.append(event_info)
+
+def net__netif_receive_skb(name, context, cpu, sec, nsec, pid, comm, skbaddr,
+                       skblen, dev_name):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr, skblen, dev_name)
+       all_event_list.append(event_info)
+
+def net__netif_rx(name, context, cpu, sec, nsec, pid, comm, skbaddr,
+                       skblen, dev_name):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr, skblen, dev_name)
+       all_event_list.append(event_info)
+
+def net__net_dev_queue(name, context, cpu, sec, nsec, pid, comm,
+                       skbaddr, skblen, dev_name):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr, skblen, dev_name)
+       all_event_list.append(event_info)
+
+def net__net_dev_xmit(name, context, cpu, sec, nsec, pid, comm,
+                       skbaddr, skblen, rc, dev_name):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr, skblen, rc ,dev_name)
+       all_event_list.append(event_info)
+
+def skb__kfree_skb(name, context, cpu, sec, nsec, pid, comm,
+                       skbaddr, protocol, location):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr, protocol, location)
+       all_event_list.append(event_info)
+
+def skb__consume_skb(name, context, cpu, sec, nsec, pid, comm, skbaddr):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr)
+       all_event_list.append(event_info)
+
+def skb__skb_copy_datagram_iovec(name, context, cpu, sec, nsec, pid, comm,
+       skbaddr, skblen):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr, skblen)
+       all_event_list.append(event_info)
+
+def handle_irq_handler_entry(event_info):
+       (name, context, cpu, time, pid, comm, irq, irq_name) = event_info
+       if cpu not in irq_dic.keys():
+               irq_dic[cpu] = []
+       irq_record = {'irq':irq, 'name':irq_name, 'cpu':cpu, 'irq_ent_t':time}
+       irq_dic[cpu].append(irq_record)
+
+def handle_irq_handler_exit(event_info):
+       (name, context, cpu, time, pid, comm, irq, ret) = event_info
+       if cpu not in irq_dic.keys():
+               return
+       irq_record = irq_dic[cpu].pop()
+       if irq != irq_record['irq']:
+               return
+       irq_record.update({'irq_ext_t':time})
+       # if an irq doesn't include NET_RX softirq, drop.
+       if 'event_list' in irq_record.keys():
+               irq_dic[cpu].append(irq_record)
+
+def handle_irq_softirq_raise(event_info):
+       (name, context, cpu, time, pid, comm, vec) = event_info
+       if cpu not in irq_dic.keys() \
+       or len(irq_dic[cpu]) == 0:
+               return
+       irq_record = irq_dic[cpu].pop()
+       if 'event_list' in irq_record.keys():
+               irq_event_list = irq_record['event_list']
+       else:
+               irq_event_list = []
+       irq_event_list.append({'time':time, 'event':'sirq_raise'})
+       irq_record.update({'event_list':irq_event_list})
+       irq_dic[cpu].append(irq_record)
+
+def handle_irq_softirq_entry(event_info):
+       (name, context, cpu, time, pid, comm, vec) = event_info
+       net_rx_dic[cpu] = {'sirq_ent_t':time, 'event_list':[]}
+
+def handle_irq_softirq_exit(event_info):
+       (name, context, cpu, time, pid, comm, vec) = event_info
+       irq_list = []
+       event_list = 0
+       if cpu in irq_dic.keys():
+               irq_list = irq_dic[cpu]
+               del irq_dic[cpu]
+       if cpu in net_rx_dic.keys():
+               sirq_ent_t = net_rx_dic[cpu]['sirq_ent_t']
+               event_list = net_rx_dic[cpu]['event_list']
+               del net_rx_dic[cpu]
+       if irq_list == [] or event_list == 0:
+               return
+       rec_data = {'sirq_ent_t':sirq_ent_t, 'sirq_ext_t':time,
+                   'irq_list':irq_list, 'event_list':event_list}
+       # merge information related to a NET_RX softirq
+       receive_hunk_list.append(rec_data)
+
+def handle_napi_poll(event_info):
+       (name, context, cpu, time, pid, comm, napi, dev_name) = event_info
+       if cpu in net_rx_dic.keys():
+               event_list = net_rx_dic[cpu]['event_list']
+               rec_data = {'event_name':'napi_poll',
+                               'dev':dev_name, 'event_t':time}
+               event_list.append(rec_data)
+
+def handle_netif_rx(event_info):
+       (name, context, cpu, time, pid, comm,
+               skbaddr, skblen, dev_name) = event_info
+       if cpu not in irq_dic.keys() \
+       or len(irq_dic[cpu]) == 0:
+               return
+       irq_record = irq_dic[cpu].pop()
+       if 'event_list' in irq_record.keys():
+               irq_event_list = irq_record['event_list']
+       else:
+               irq_event_list = []
+       irq_event_list.append({'time':time, 'event':'netif_rx',
+               'skbaddr':skbaddr, 'skblen':skblen, 'dev_name':dev_name})
+       irq_record.update({'event_list':irq_event_list})
+       irq_dic[cpu].append(irq_record)
+
+def handle_netif_receive_skb(event_info):
+       global of_count_rx_skb_list
+
+       (name, context, cpu, time, pid, comm,
+               skbaddr, skblen, dev_name) = event_info
+       if cpu in net_rx_dic.keys():
+               rec_data = {'event_name':'netif_receive_skb',
+                           'event_t':time, 'skbaddr':skbaddr, 'len':skblen}
+               event_list = net_rx_dic[cpu]['event_list']
+               event_list.append(rec_data)
+               rx_skb_list.insert(0, rec_data)
+               if len(rx_skb_list) > buffer_budget:
+                       rx_skb_list.pop()
+                       of_count_rx_skb_list += 1
+
+def handle_net_dev_queue(event_info):
+       global of_count_tx_queue_list
+
+       (name, context, cpu, time, pid, comm,
+               skbaddr, skblen, dev_name) = event_info
+       skb = {'dev':dev_name, 'skbaddr':skbaddr, 'len':skblen, 'queue_t':time}
+       tx_queue_list.insert(0, skb)
+       if len(tx_queue_list) > buffer_budget:
+               tx_queue_list.pop()
+               of_count_tx_queue_list += 1
+
+def handle_net_dev_xmit(event_info):
+       global of_count_tx_xmit_list
+
+       (name, context, cpu, time, pid, comm,
+               skbaddr, skblen, rc, dev_name) = event_info
+       if rc == 0: # NETDEV_TX_OK
+               for i in range(len(tx_queue_list)):
+                       skb = tx_queue_list[i]
+                       if skb['skbaddr'] == skbaddr:
+                               skb['xmit_t'] = time
+                               tx_xmit_list.insert(0, skb)
+                               del tx_queue_list[i]
+                               if len(tx_xmit_list) > buffer_budget:
+                                       tx_xmit_list.pop()
+                                       of_count_tx_xmit_list += 1
+                               return
+
+def handle_kfree_skb(event_info):
+       (name, context, cpu, time, pid, comm,
+               skbaddr, protocol, location) = event_info
+       for i in range(len(tx_queue_list)):
+               skb = tx_queue_list[i]
+               if skb['skbaddr'] == skbaddr:
+                       del tx_queue_list[i]
+                       return
+       for i in range(len(tx_xmit_list)):
+               skb = tx_xmit_list[i]
+               if skb['skbaddr'] == skbaddr:
+                       skb['free_t'] = time
+                       tx_free_list.append(skb)
+                       del tx_xmit_list[i]
+                       return
+       for i in range(len(rx_skb_list)):
+               rec_data = rx_skb_list[i]
+               if rec_data['skbaddr'] == skbaddr:
+                       rec_data.update({'handle':"kfree_skb",
+                                       'comm':comm, 'pid':pid, 'comm_t':time})
+                       del rx_skb_list[i]
+                       return
+
+def handle_consume_skb(event_info):
+       (name, context, cpu, time, pid, comm, skbaddr) = event_info
+       for i in range(len(tx_xmit_list)):
+               skb = tx_xmit_list[i]
+               if skb['skbaddr'] == skbaddr:
+                       skb['free_t'] = time
+                       tx_free_list.append(skb)
+                       del tx_xmit_list[i]
+                       return
+
+def handle_skb_copy_datagram_iovec(event_info):
+       (name, context, cpu, time, pid, comm, skbaddr, skblen) = event_info
+       for i in range(len(rx_skb_list)):
+               rec_data = rx_skb_list[i]
+               if skbaddr == rec_data['skbaddr']:
+                       rec_data.update({'handle':"skb_copy_datagram_iovec",
+                                       'comm':comm, 'pid':pid, 'comm_t':time})
+                       del rx_skb_list[i]
+                       return
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h

index 27e9ebe4076e0efbf4117a46f2dbbcc74a1dcc3e..a7729797fd96254bc35326077337a71f919c19b5 100644 (file)
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -82,6 +82,8 @@ extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2
  extern char *perf_pathdup(const char *fmt, ...)
         __attribute__((format (printf, 1, 2)));
  
+#ifdef NO_STRLCPY
  extern size_t strlcpy(char *dest, const char *src, size_t size);
+#endif
  
  #endif /* __PERF_CACHE_H */
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c

index f231f43424d27930a286cb52902c21cb4534a068..e12d539417b2cc4644e2d5a919cfb8a23e8ae163 100644 (file)
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -28,6 +28,9 @@ bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event)
  #define chain_for_each_child(child, parent)    \
         list_for_each_entry(child, &parent->children, brothers)
  
+#define chain_for_each_child_safe(child, next, parent) \
+       list_for_each_entry_safe(child, next, &parent->children, brothers)
+
  static void
  rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
                     enum chain_mode mode)
@@ -86,10 +89,10 @@ __sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node,
   * sort them by hit
   */
  static void
-sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node,
+sort_chain_flat(struct rb_root *rb_root, struct callchain_root *root,
                 u64 min_hit, struct callchain_param *param __used)
  {
-       __sort_chain_flat(rb_root, node, min_hit);
+       __sort_chain_flat(rb_root, &root->node, min_hit);
  }
  
  static void __sort_chain_graph_abs(struct callchain_node *node,
@@ -108,11 +111,11 @@ static void __sort_chain_graph_abs(struct callchain_node *node,
  }
  
  static void
-sort_chain_graph_abs(struct rb_root *rb_root, struct callchain_node *chain_root,
+sort_chain_graph_abs(struct rb_root *rb_root, struct callchain_root *chain_root,
                      u64 min_hit, struct callchain_param *param __used)
  {
-       __sort_chain_graph_abs(chain_root, min_hit);
-       rb_root->rb_node = chain_root->rb_root.rb_node;
+       __sort_chain_graph_abs(&chain_root->node, min_hit);
+       rb_root->rb_node = chain_root->node.rb_root.rb_node;
  }
  
  static void __sort_chain_graph_rel(struct callchain_node *node,
@@ -133,11 +136,11 @@ static void __sort_chain_graph_rel(struct callchain_node *node,
  }
  
  static void
-sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_node *chain_root,
+sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_root *chain_root,
                      u64 min_hit __used, struct callchain_param *param)
  {
-       __sort_chain_graph_rel(chain_root, param->min_percent / 100.0);
-       rb_root->rb_node = chain_root->rb_root.rb_node;
+       __sort_chain_graph_rel(&chain_root->node, param->min_percent / 100.0);
+       rb_root->rb_node = chain_root->node.rb_root.rb_node;
  }
  
  int register_callchain_param(struct callchain_param *param)
@@ -284,19 +287,18 @@ split_add_child(struct callchain_node *parent, struct resolved_chain *chain,
  }
  
  static int
-__append_chain(struct callchain_node *root, struct resolved_chain *chain,
-              unsigned int start, u64 period);
+append_chain(struct callchain_node *root, struct resolved_chain *chain,
+            unsigned int start, u64 period);
  
  static void
-__append_chain_children(struct callchain_node *root,
-                       struct resolved_chain *chain,
-                       unsigned int start, u64 period)
+append_chain_children(struct callchain_node *root, struct resolved_chain *chain,
+                     unsigned int start, u64 period)
  {
         struct callchain_node *rnode;
  
         /* lookup in childrens */
         chain_for_each_child(rnode, root) {
-               unsigned int ret = __append_chain(rnode, chain, start, period);
+               unsigned int ret = append_chain(rnode, chain, start, period);
  
                 if (!ret)
                         goto inc_children_hit;
@@ -309,8 +311,8 @@ inc_children_hit:
  }
  
  static int
-__append_chain(struct callchain_node *root, struct resolved_chain *chain,
-              unsigned int start, u64 period)
+append_chain(struct callchain_node *root, struct resolved_chain *chain,
+            unsigned int start, u64 period)
  {
         struct callchain_list *cnode;
         unsigned int i = start;
@@ -357,7 +359,7 @@ __append_chain(struct callchain_node *root, struct resolved_chain *chain,
         }
  
         /* We match the node and still have a part remaining */
-       __append_chain_children(root, chain, i, period);
+       append_chain_children(root, chain, i, period);
  
         return 0;
  }
@@ -380,8 +382,8 @@ static void filter_context(struct ip_callchain *old, struct resolved_chain *new,
  }
  
  
-int append_chain(struct callchain_node *root, struct ip_callchain *chain,
-                struct map_symbol *syms, u64 period)
+int callchain_append(struct callchain_root *root, struct ip_callchain *chain,
+                    struct map_symbol *syms, u64 period)
  {
         struct resolved_chain *filtered;
  
@@ -398,9 +400,65 @@ int append_chain(struct callchain_node *root, struct ip_callchain *chain,
         if (!filtered->nr)
                 goto end;
  
-       __append_chain_children(root, filtered, 0, period);
+       append_chain_children(&root->node, filtered, 0, period);
+
+       if (filtered->nr > root->max_depth)
+               root->max_depth = filtered->nr;
  end:
         free(filtered);
  
         return 0;
  }
+
+static int
+merge_chain_branch(struct callchain_node *dst, struct callchain_node *src,
+                  struct resolved_chain *chain)
+{
+       struct callchain_node *child, *next_child;
+       struct callchain_list *list, *next_list;
+       int old_pos = chain->nr;
+       int err = 0;
+
+       list_for_each_entry_safe(list, next_list, &src->val, list) {
+               chain->ips[chain->nr].ip = list->ip;
+               chain->ips[chain->nr].ms = list->ms;
+               chain->nr++;
+               list_del(&list->list);
+               free(list);
+       }
+
+       if (src->hit)
+               append_chain_children(dst, chain, 0, src->hit);
+
+       chain_for_each_child_safe(child, next_child, src) {
+               err = merge_chain_branch(dst, child, chain);
+               if (err)
+                       break;
+
+               list_del(&child->brothers);
+               free(child);
+       }
+
+       chain->nr = old_pos;
+
+       return err;
+}
+
+int callchain_merge(struct callchain_root *dst, struct callchain_root *src)
+{
+       struct resolved_chain *chain;
+       int err;
+
+       chain = malloc(sizeof(*chain) +
+                      src->max_depth * sizeof(struct resolved_ip));
+       if (!chain)
+               return -ENOMEM;
+
+       chain->nr = 0;
+
+       err = merge_chain_branch(&dst->node, &src->node, chain);
+
+       free(chain);
+
+       return err;
+}
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h

index 6de4313924fb5c510f354cb3a8e2a73061e103f6..c15fb8c24ad2b87388e97cd6346cfdebaac11dd5 100644 (file)
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -26,9 +26,14 @@ struct callchain_node {
         u64                     children_hit;
  };
  
+struct callchain_root {
+       u64                     max_depth;
+       struct callchain_node   node;
+};
+
  struct callchain_param;
  
-typedef void (*sort_chain_func_t)(struct rb_root *, struct callchain_node *,
+typedef void (*sort_chain_func_t)(struct rb_root *, struct callchain_root *,
                                  u64, struct callchain_param *);
  
  struct callchain_param {
@@ -44,15 +49,16 @@ struct callchain_list {
         struct list_head        list;
  };
  
-static inline void callchain_init(struct callchain_node *node)
+static inline void callchain_init(struct callchain_root *root)
  {
-       INIT_LIST_HEAD(&node->brothers);
-       INIT_LIST_HEAD(&node->children);
-       INIT_LIST_HEAD(&node->val);
+       INIT_LIST_HEAD(&root->node.brothers);
+       INIT_LIST_HEAD(&root->node.children);
+       INIT_LIST_HEAD(&root->node.val);
  
-       node->children_hit = 0;
-       node->parent = NULL;
-       node->hit = 0;
+       root->node.parent = NULL;
+       root->node.hit = 0;
+       root->node.children_hit = 0;
+       root->max_depth = 0;
  }
  
  static inline u64 cumul_hits(struct callchain_node *node)
@@ -61,8 +67,9 @@ static inline u64 cumul_hits(struct callchain_node *node)
  }
  
  int register_callchain_param(struct callchain_param *param);
-int append_chain(struct callchain_node *root, struct ip_callchain *chain,
-                struct map_symbol *syms, u64 period);
+int callchain_append(struct callchain_root *root, struct ip_callchain *chain,
+                    struct map_symbol *syms, u64 period);
+int callchain_merge(struct callchain_root *dst, struct callchain_root *src);
  
  bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event);
  #endif /* __PERF_CALLCHAIN_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c

index be22ae6ef0558009c0a1faaa4f55bcf2c5d828ca..2022e87409942ca4b0d133c3f889e41178a663d1 100644 (file)
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -87,7 +87,7 @@ static void hist_entry__add_cpumode_period(struct hist_entry *self,
  
  static struct hist_entry *hist_entry__new(struct hist_entry *template)
  {
-       size_t callchain_size = symbol_conf.use_callchain ? sizeof(struct callchain_node) : 0;
+       size_t callchain_size = symbol_conf.use_callchain ? sizeof(struct callchain_root) : 0;
         struct hist_entry *self = malloc(sizeof(*self) + callchain_size);
  
         if (self != NULL) {
@@ -226,6 +226,8 @@ static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
  
                 if (!cmp) {
                         iter->period += he->period;
+                       if (symbol_conf.use_callchain)
+                               callchain_merge(iter->callchain, he->callchain);
                         hist_entry__free(he);
                         return false;
                 }
diff --git a/tools/perf/util/path.c b/tools/perf/util/path.c

index 58a470d036dd0917c16eda49fb8b1987703ca7b5..bd74977114242ff465af39a291d30aa7d463f3b2 100644 (file)
--- a/tools/perf/util/path.c
+++ b/tools/perf/util/path.c
@@ -22,6 +22,7 @@ static const char *get_perf_dir(void)
         return ".";
  }
  
+#ifdef NO_STRLCPY
  size_t strlcpy(char *dest, const char *src, size_t size)
  {
         size_t ret = strlen(src);
@@ -33,7 +34,7 @@ size_t strlcpy(char *dest, const char *src, size_t size)
         }
         return ret;
  }
-
+#endif
  
  static char *get_pathname(void)
  {
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h

index 46e531d09e8bfcbe1064a4307ce5a75ef72a6405..0b91053a7d11af888eea81a4c8de24fdd60ce6f8 100644 (file)
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -70,7 +70,7 @@ struct hist_entry {
                 struct hist_entry *pair;
                 struct rb_root    sorted_chain;
         };
-       struct callchain_node   callchain[0];
+       struct callchain_root   callchain[0];
  };
  
  enum sort_type {
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c

index b2f5ae97f33dded1cdaba3e843ab2df888515777..b39f499e575a604198bf1bb11d11d6280a091548 100644 (file)
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -388,6 +388,20 @@ size_t dso__fprintf_buildid(struct dso *self, FILE *fp)
         return fprintf(fp, "%s", sbuild_id);
  }
  
+size_t dso__fprintf_symbols_by_name(struct dso *self, enum map_type type, FILE *fp)
+{
+       size_t ret = 0;
+       struct rb_node *nd;
+       struct symbol_name_rb_node *pos;
+
+       for (nd = rb_first(&self->symbol_names[type]); nd; nd = rb_next(nd)) {
+               pos = rb_entry(nd, struct symbol_name_rb_node, rb_node);
+               fprintf(fp, "%s\n", pos->sym.name);
+       }
+
+       return ret;
+}
+
  size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp)
  {
         struct rb_node *nd;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h

index ea95c2756f05e73d09f6868f24b174ed1221f59a..038f2201ee09579ca3f460d9f59576770ea477d2 100644 (file)
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -182,6 +182,7 @@ size_t machines__fprintf_dsos(struct rb_root *self, FILE *fp);
  size_t machines__fprintf_dsos_buildid(struct rb_root *self, FILE *fp, bool with_hits);
  
  size_t dso__fprintf_buildid(struct dso *self, FILE *fp);
+size_t dso__fprintf_symbols_by_name(struct dso *self, enum map_type type, FILE *fp);
  size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp);
  
  enum dso_origin {
diff --git a/tools/perf/util/ui/browser.c b/tools/perf/util/ui/browser.c

index 66f2d583d8c4326971dc9d37cd47df57eeb82306..6d0df809a2edab24f28af4bea093d1cda3c2614d 100644 (file)
--- a/tools/perf/util/ui/browser.c
+++ b/tools/perf/util/ui/browser.c
@@ -1,16 +1,6 @@
-#define _GNU_SOURCE
-#include <stdio.h>
-#undef _GNU_SOURCE
-/*
- * slang versions <= 2.0.6 have a "#if HAVE_LONG_LONG" that breaks
- * the build if it isn't defined. Use the equivalent one that glibc
- * has on features.h.
- */
-#include <features.h>
-#ifndef HAVE_LONG_LONG
-#define HAVE_LONG_LONG __GLIBC_HAVE_LONG_LONG
-#endif
  #include <slang.h>
+#include "libslang.h"
+#include <linux/compiler.h>
  #include <linux/list.h>
  #include <linux/rbtree.h>
  #include <stdlib.h>
@@ -19,17 +9,9 @@
  #include "helpline.h"
  #include "../color.h"
  #include "../util.h"
+#include <stdio.h>
  
-#if SLANG_VERSION < 20104
-#define sltt_set_color(obj, name, fg, bg) \
-       SLtt_set_color(obj,(char *)name, (char *)fg, (char *)bg)
-#else
-#define sltt_set_color SLtt_set_color
-#endif
-
-newtComponent newt_form__new(void);
-
-int ui_browser__percent_color(double percent, bool current)
+static int ui_browser__percent_color(double percent, bool current)
  {
         if (current)
                 return HE_COLORSET_SELECTED;
@@ -40,6 +22,23 @@ int ui_browser__percent_color(double percent, bool current)
         return HE_COLORSET_NORMAL;
  }
  
+void ui_browser__set_color(struct ui_browser *self __used, int color)
+{
+       SLsmg_set_color(color);
+}
+
+void ui_browser__set_percent_color(struct ui_browser *self,
+                                  double percent, bool current)
+{
+        int color = ui_browser__percent_color(percent, current);
+        ui_browser__set_color(self, color);
+}
+
+void ui_browser__gotorc(struct ui_browser *self, int y, int x)
+{
+       SLsmg_gotorc(self->y + y, self->x + x);
+}
+
  void ui_browser__list_head_seek(struct ui_browser *self, off_t offset, int whence)
  {
         struct list_head *head = self->entries;
@@ -111,7 +110,7 @@ unsigned int ui_browser__rb_tree_refresh(struct ui_browser *self)
         nd = self->top;
  
         while (nd != NULL) {
-               SLsmg_gotorc(self->y + row, self->x);
+               ui_browser__gotorc(self, row, 0);
                 self->write(self, nd, row);
                 if (++row == self->height)
                         break;
@@ -131,13 +130,10 @@ void ui_browser__refresh_dimensions(struct ui_browser *self)
         int cols, rows;
         newtGetScreenSize(&cols, &rows);
  
-       if (self->width > cols - 4)
-               self->width = cols - 4;
-       self->height = rows - 5;
-       if (self->height > self->nr_entries)
-               self->height = self->nr_entries;
-       self->y  = (rows - self->height) / 2;
-       self->x = (cols - self->width) / 2;
+       self->width = cols - 1;
+       self->height = rows - 2;
+       self->y = 1;
+       self->x = 0;
  }
  
  void ui_browser__reset_index(struct ui_browser *self)
@@ -146,34 +142,48 @@ void ui_browser__reset_index(struct ui_browser *self)
         self->seek(self, 0, SEEK_SET);
  }
  
+void ui_browser__add_exit_key(struct ui_browser *self, int key)
+{
+       newtFormAddHotKey(self->form, key);
+}
+
+void ui_browser__add_exit_keys(struct ui_browser *self, int keys[])
+{
+       int i = 0;
+
+       while (keys[i] && i < 64) {
+               ui_browser__add_exit_key(self, keys[i]);
+               ++i;
+       }
+}
+
  int ui_browser__show(struct ui_browser *self, const char *title,
                      const char *helpline, ...)
  {
         va_list ap;
+       int keys[] = { NEWT_KEY_UP, NEWT_KEY_DOWN, NEWT_KEY_PGUP,
+                      NEWT_KEY_PGDN, NEWT_KEY_HOME, NEWT_KEY_END, ' ',
+                      NEWT_KEY_LEFT, NEWT_KEY_ESCAPE, 'q', CTRL('c'), 0 };
  
-       if (self->form != NULL) {
+       if (self->form != NULL)
                 newtFormDestroy(self->form);
-               newtPopWindow();
-       }
+
         ui_browser__refresh_dimensions(self);
-       newtCenteredWindow(self->width, self->height, title);
-       self->form = newt_form__new();
+       self->form = newtForm(NULL, NULL, 0);
         if (self->form == NULL)
                 return -1;
  
-       self->sb = newtVerticalScrollbar(self->width, 0, self->height,
+       self->sb = newtVerticalScrollbar(self->width, 1, self->height,
                                          HE_COLORSET_NORMAL,
                                          HE_COLORSET_SELECTED);
         if (self->sb == NULL)
                 return -1;
  
-       newtFormAddHotKey(self->form, NEWT_KEY_UP);
-       newtFormAddHotKey(self->form, NEWT_KEY_DOWN);
-       newtFormAddHotKey(self->form, NEWT_KEY_PGUP);
-       newtFormAddHotKey(self->form, NEWT_KEY_PGDN);
-       newtFormAddHotKey(self->form, NEWT_KEY_HOME);
-       newtFormAddHotKey(self->form, NEWT_KEY_END);
-       newtFormAddHotKey(self->form, ' ');
+       SLsmg_gotorc(0, 0);
+       ui_browser__set_color(self, NEWT_COLORSET_ROOT);
+       slsmg_write_nstring(title, self->width);
+
+       ui_browser__add_exit_keys(self, keys);
         newtFormAddComponent(self->form, self->sb);
  
         va_start(ap, helpline);
@@ -185,7 +195,6 @@ int ui_browser__show(struct ui_browser *self, const char *title,
  void ui_browser__hide(struct ui_browser *self)
  {
         newtFormDestroy(self->form);
-       newtPopWindow();
         self->form = NULL;
         ui_helpline__pop();
  }
@@ -196,28 +205,28 @@ int ui_browser__refresh(struct ui_browser *self)
  
         newtScrollbarSet(self->sb, self->index, self->nr_entries - 1);
         row = self->refresh(self);
-       SLsmg_set_color(HE_COLORSET_NORMAL);
+       ui_browser__set_color(self, HE_COLORSET_NORMAL);
         SLsmg_fill_region(self->y + row, self->x,
                           self->height - row, self->width, ' ');
  
         return 0;
  }
  
-int ui_browser__run(struct ui_browser *self, struct newtExitStruct *es)
+int ui_browser__run(struct ui_browser *self)
  {
+       struct newtExitStruct es;
+
         if (ui_browser__refresh(self) < 0)
                 return -1;
  
         while (1) {
                 off_t offset;
  
-               newtFormRun(self->form, es);
+               newtFormRun(self->form, &es);
  
-               if (es->reason != NEWT_EXIT_HOTKEY)
+               if (es.reason != NEWT_EXIT_HOTKEY)
                         break;
-               if (is_exit_key(es->u.key))
-                       return es->u.key;
-               switch (es->u.key) {
+               switch (es.u.key) {
                 case NEWT_KEY_DOWN:
                         if (self->index == self->nr_entries - 1)
                                 break;
@@ -274,12 +283,12 @@ int ui_browser__run(struct ui_browser *self, struct newtExitStruct *es)
                         self->seek(self, -offset, SEEK_END);
                         break;
                 default:
-                       return es->u.key;
+                       return es.u.key;
                 }
                 if (ui_browser__refresh(self) < 0)
                         return -1;
         }
-       return 0;
+       return -1;
  }
  
  unsigned int ui_browser__list_head_refresh(struct ui_browser *self)
@@ -294,7 +303,7 @@ unsigned int ui_browser__list_head_refresh(struct ui_browser *self)
         pos = self->top;
  
         list_for_each_from(pos, head) {
-               SLsmg_gotorc(self->y + row, self->x);
+               ui_browser__gotorc(self, row, 0);
                 self->write(self, pos, row);
                 if (++row == self->height)
                         break;
diff --git a/tools/perf/util/ui/browser.h b/tools/perf/util/ui/browser.h

index 0b9f829214f756ec16227745835cfea55d7ee503..0dc7e4da36f52c42ef3574dc89dce7102ae8438d 100644 (file)
--- a/tools/perf/util/ui/browser.h
+++ b/tools/perf/util/ui/browser.h
@@ -25,16 +25,21 @@ struct ui_browser {
  };
  
  
-int ui_browser__percent_color(double percent, bool current);
+void ui_browser__set_color(struct ui_browser *self, int color);
+void ui_browser__set_percent_color(struct ui_browser *self,
+                                  double percent, bool current);
  bool ui_browser__is_current_entry(struct ui_browser *self, unsigned row);
  void ui_browser__refresh_dimensions(struct ui_browser *self);
  void ui_browser__reset_index(struct ui_browser *self);
  
+void ui_browser__gotorc(struct ui_browser *self, int y, int x);
+void ui_browser__add_exit_key(struct ui_browser *self, int key);
+void ui_browser__add_exit_keys(struct ui_browser *self, int keys[]);
  int ui_browser__show(struct ui_browser *self, const char *title,
                      const char *helpline, ...);
  void ui_browser__hide(struct ui_browser *self);
  int ui_browser__refresh(struct ui_browser *self);
-int ui_browser__run(struct ui_browser *self, struct newtExitStruct *es);
+int ui_browser__run(struct ui_browser *self);
  
  void ui_browser__rb_tree_seek(struct ui_browser *self, off_t offset, int whence);
  unsigned int ui_browser__rb_tree_refresh(struct ui_browser *self);
diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c

index a90273e63f4fb6939ea64e074513e1afabb1f289..82b78f99251bb2b764165cf8066a85f1e6e4b97d 100644 (file)
--- a/tools/perf/util/ui/browsers/annotate.c
+++ b/tools/perf/util/ui/browsers/annotate.c
@@ -40,14 +40,12 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro
  
         if (ol->offset != -1) {
                 struct objdump_line_rb_node *olrb = objdump_line__rb(ol);
-               int color = ui_browser__percent_color(olrb->percent, current_entry);
-               SLsmg_set_color(color);
+               ui_browser__set_percent_color(self, olrb->percent, current_entry);
                 slsmg_printf(" %7.2f ", olrb->percent);
                 if (!current_entry)
-                       SLsmg_set_color(HE_COLORSET_CODE);
+                       ui_browser__set_color(self, HE_COLORSET_CODE);
         } else {
-               int color = ui_browser__percent_color(0, current_entry);
-               SLsmg_set_color(color);
+               ui_browser__set_percent_color(self, 0, current_entry);
                 slsmg_write_nstring(" ", 9);
         }
  
@@ -135,32 +133,31 @@ static void annotate_browser__set_top(struct annotate_browser *self,
         self->curr_hot = nd;
  }
  
-static int annotate_browser__run(struct annotate_browser *self,
-                                struct newtExitStruct *es)
+static int annotate_browser__run(struct annotate_browser *self)
  {
         struct rb_node *nd;
         struct hist_entry *he = self->b.priv;
+       int key;
  
         if (ui_browser__show(&self->b, he->ms.sym->name,
-                            "<- or ESC: exit, TAB/shift+TAB: cycle thru samples") < 0)
+                            "<-, -> or ESC: exit, TAB/shift+TAB: cycle thru samples") < 0)
                 return -1;
-
-       newtFormAddHotKey(self->b.form, NEWT_KEY_LEFT);
-       newtFormAddHotKey(self->b.form, NEWT_KEY_RIGHT);
+       /*
+        * To allow builtin-annotate to cycle thru multiple symbols by
+        * examining the exit key for this function.
+        */
+       ui_browser__add_exit_key(&self->b, NEWT_KEY_RIGHT);
  
         nd = self->curr_hot;
         if (nd) {
-               newtFormAddHotKey(self->b.form, NEWT_KEY_TAB);
-               newtFormAddHotKey(self->b.form, NEWT_KEY_UNTAB);
+               int tabs[] = { NEWT_KEY_TAB, NEWT_KEY_UNTAB, 0 };
+               ui_browser__add_exit_keys(&self->b, tabs);
         }
  
         while (1) {
-               ui_browser__run(&self->b, es);
-
-               if (es->reason != NEWT_EXIT_HOTKEY)
-                       break;
+               key = ui_browser__run(&self->b);
  
-               switch (es->u.key) {
+               switch (key) {
                 case NEWT_KEY_TAB:
                         nd = rb_prev(nd);
                         if (nd == NULL)
@@ -179,12 +176,11 @@ static int annotate_browser__run(struct annotate_browser *self,
         }
  out:
         ui_browser__hide(&self->b);
-       return es->u.key;
+       return key;
  }
  
  int hist_entry__tui_annotate(struct hist_entry *self)
  {
-       struct newtExitStruct es;
         struct objdump_line *pos, *n;
         struct objdump_line_rb_node *rbpos;
         LIST_HEAD(head);
@@ -232,7 +228,7 @@ int hist_entry__tui_annotate(struct hist_entry *self)
                 annotate_browser__set_top(&browser, browser.curr_hot);
  
         browser.b.width += 18; /* Percentage */
-       ret = annotate_browser__run(&browser, &es);
+       ret = annotate_browser__run(&browser);
         list_for_each_entry_safe(pos, n, &head, node) {
                 list_del(&pos->node);
                 objdump_line__free(pos);
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c

index dafdf6775d77f44d69abf1980b1a9cfe4ab053dc..ebda8c3fde9e6468ddbd84fc2df7e324ba862854 100644 (file)
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -58,6 +58,11 @@ static char callchain_list__folded(const struct callchain_list *self)
         return map_symbol__folded(&self->ms);
  }
  
+static void map_symbol__set_folding(struct map_symbol *self, bool unfold)
+{
+       self->unfolded = unfold ? self->has_children : false;
+}
+
  static int callchain_node__count_rows_rb_tree(struct callchain_node *self)
  {
         int n = 0;
@@ -129,16 +134,16 @@ static void callchain_node__init_have_children_rb_tree(struct callchain_node *se
         for (nd = rb_first(&self->rb_root); nd; nd = rb_next(nd)) {
                 struct callchain_node *child = rb_entry(nd, struct callchain_node, rb_node);
                 struct callchain_list *chain;
-               int first = true;
+               bool first = true;
  
                 list_for_each_entry(chain, &child->val, list) {
                         if (first) {
                                 first = false;
                                 chain->ms.has_children = chain->list.next != &child->val ||
-                                                        rb_first(&child->rb_root) != NULL;
+                                                        !RB_EMPTY_ROOT(&child->rb_root);
                         } else
                                 chain->ms.has_children = chain->list.next == &child->val &&
-                                                        rb_first(&child->rb_root) != NULL;
+                                                        !RB_EMPTY_ROOT(&child->rb_root);
                 }
  
                 callchain_node__init_have_children_rb_tree(child);
@@ -150,7 +155,7 @@ static void callchain_node__init_have_children(struct callchain_node *self)
         struct callchain_list *chain;
  
         list_for_each_entry(chain, &self->val, list)
-               chain->ms.has_children = rb_first(&self->rb_root) != NULL;
+               chain->ms.has_children = !RB_EMPTY_ROOT(&self->rb_root);
  
         callchain_node__init_have_children_rb_tree(self);
  }
@@ -168,6 +173,7 @@ static void callchain__init_have_children(struct rb_root *self)
  static void hist_entry__init_have_children(struct hist_entry *self)
  {
         if (!self->init_have_children) {
+               self->ms.has_children = !RB_EMPTY_ROOT(&self->sorted_chain);
                 callchain__init_have_children(&self->sorted_chain);
                 self->init_have_children = true;
         }
@@ -195,43 +201,114 @@ static bool hist_browser__toggle_fold(struct hist_browser *self)
         return false;
  }
  
-static int hist_browser__run(struct hist_browser *self, const char *title,
-                            struct newtExitStruct *es)
+static int callchain_node__set_folding_rb_tree(struct callchain_node *self, bool unfold)
+{
+       int n = 0;
+       struct rb_node *nd;
+
+       for (nd = rb_first(&self->rb_root); nd; nd = rb_next(nd)) {
+               struct callchain_node *child = rb_entry(nd, struct callchain_node, rb_node);
+               struct callchain_list *chain;
+               bool has_children = false;
+
+               list_for_each_entry(chain, &child->val, list) {
+                       ++n;
+                       map_symbol__set_folding(&chain->ms, unfold);
+                       has_children = chain->ms.has_children;
+               }
+
+               if (has_children)
+                       n += callchain_node__set_folding_rb_tree(child, unfold);
+       }
+
+       return n;
+}
+
+static int callchain_node__set_folding(struct callchain_node *node, bool unfold)
+{
+       struct callchain_list *chain;
+       bool has_children = false;
+       int n = 0;
+
+       list_for_each_entry(chain, &node->val, list) {
+               ++n;
+               map_symbol__set_folding(&chain->ms, unfold);
+               has_children = chain->ms.has_children;
+       }
+
+       if (has_children)
+               n += callchain_node__set_folding_rb_tree(node, unfold);
+
+       return n;
+}
+
+static int callchain__set_folding(struct rb_root *chain, bool unfold)
+{
+       struct rb_node *nd;
+       int n = 0;
+
+       for (nd = rb_first(chain); nd; nd = rb_next(nd)) {
+               struct callchain_node *node = rb_entry(nd, struct callchain_node, rb_node);
+               n += callchain_node__set_folding(node, unfold);
+       }
+
+       return n;
+}
+
+static void hist_entry__set_folding(struct hist_entry *self, bool unfold)
+{
+       hist_entry__init_have_children(self);
+       map_symbol__set_folding(&self->ms, unfold);
+
+       if (self->ms.has_children) {
+               int n = callchain__set_folding(&self->sorted_chain, unfold);
+               self->nr_rows = unfold ? n : 0;
+       } else
+               self->nr_rows = 0;
+}
+
+static void hists__set_folding(struct hists *self, bool unfold)
+{
+       struct rb_node *nd;
+
+       self->nr_entries = 0;
+
+       for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
+               struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
+               hist_entry__set_folding(he, unfold);
+               self->nr_entries += 1 + he->nr_rows;
+       }
+}
+
+static void hist_browser__set_folding(struct hist_browser *self, bool unfold)
+{
+       hists__set_folding(self->hists, unfold);
+       self->b.nr_entries = self->hists->nr_entries;
+       /* Go to the start, we may be way after valid entries after a collapse */
+       ui_browser__reset_index(&self->b);
+}
+
+static int hist_browser__run(struct hist_browser *self, const char *title)
  {
-       char str[256], unit;
-       unsigned long nr_events = self->hists->stats.nr_events[PERF_RECORD_SAMPLE];
+       int key;
+       int exit_keys[] = { 'a', '?', 'h', 'C', 'd', 'D', 'E', 't',
+                           NEWT_KEY_ENTER, NEWT_KEY_RIGHT, NEWT_KEY_LEFT, 0, };
  
         self->b.entries = &self->hists->entries;
         self->b.nr_entries = self->hists->nr_entries;
  
         hist_browser__refresh_dimensions(self);
  
-       nr_events = convert_unit(nr_events, &unit);
-       snprintf(str, sizeof(str), "Events: %lu%c                            ",
-                nr_events, unit);
-       newtDrawRootText(0, 0, str);
-
         if (ui_browser__show(&self->b, title,
                              "Press '?' for help on key bindings") < 0)
                 return -1;
  
-       newtFormAddHotKey(self->b.form, 'a');
-       newtFormAddHotKey(self->b.form, '?');
-       newtFormAddHotKey(self->b.form, 'h');
-       newtFormAddHotKey(self->b.form, 'd');
-       newtFormAddHotKey(self->b.form, 'D');
-       newtFormAddHotKey(self->b.form, 't');
-
-       newtFormAddHotKey(self->b.form, NEWT_KEY_LEFT);
-       newtFormAddHotKey(self->b.form, NEWT_KEY_RIGHT);
-       newtFormAddHotKey(self->b.form, NEWT_KEY_ENTER);
+       ui_browser__add_exit_keys(&self->b, exit_keys);
  
         while (1) {
-               ui_browser__run(&self->b, es);
+               key = ui_browser__run(&self->b);
  
-               if (es->reason != NEWT_EXIT_HOTKEY)
-                       break;
-               switch (es->u.key) {
+               switch (key) {
                 case 'D': { /* Debug */
                         static int seq;
                         struct hist_entry *h = rb_entry(self->b.top,
@@ -245,18 +322,26 @@ static int hist_browser__run(struct hist_browser *self, const char *title,
                                            self->b.top_idx,
                                            h->row_offset, h->nr_rows);
                 }
-                       continue;
+                       break;
+               case 'C':
+                       /* Collapse the whole world. */
+                       hist_browser__set_folding(self, false);
+                       break;
+               case 'E':
+                       /* Expand the whole world. */
+                       hist_browser__set_folding(self, true);
+                       break;
                 case NEWT_KEY_ENTER:
                         if (hist_browser__toggle_fold(self))
                                 break;
                         /* fall thru */
                 default:
-                       return 0;
+                       goto out;
                 }
         }
-
+out:
         ui_browser__hide(&self->b);
-       return 0;
+       return key;
  }
  
  static char *callchain_list__sym_name(struct callchain_list *self,
@@ -306,15 +391,10 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *self,
                         int color;
                         bool was_first = first;
  
-                       if (first) {
+                       if (first)
                                 first = false;
-                               chain->ms.has_children = chain->list.next != &child->val ||
-                                                        rb_first(&child->rb_root) != NULL;
-                       } else {
+                       else
                                 extra_offset = LEVEL_OFFSET_STEP;
-                               chain->ms.has_children = chain->list.next == &child->val &&
-                                                        rb_first(&child->rb_root) != NULL;
-                       }
  
                         folded_sign = callchain_list__folded(chain);
                         if (*row_offset != 0) {
@@ -341,8 +421,8 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *self,
                                 *is_current_entry = true;
                         }
  
-                       SLsmg_set_color(color);
-                       SLsmg_gotorc(self->b.y + row, self->b.x);
+                       ui_browser__set_color(&self->b, color);
+                       ui_browser__gotorc(&self->b, row, 0);
                         slsmg_write_nstring(" ", offset + extra_offset);
                         slsmg_printf("%c ", folded_sign);
                         slsmg_write_nstring(str, width);
@@ -384,12 +464,7 @@ static int hist_browser__show_callchain_node(struct hist_browser *self,
         list_for_each_entry(chain, &node->val, list) {
                 char ipstr[BITS_PER_LONG / 4 + 1], *s;
                 int color;
-               /*
-                * FIXME: This should be moved to somewhere else,
-                * probably when the callchain is created, so as not to
-                * traverse it all over again
-                */
-               chain->ms.has_children = rb_first(&node->rb_root) != NULL;
+
                 folded_sign = callchain_list__folded(chain);
  
                 if (*row_offset != 0) {
@@ -405,8 +480,8 @@ static int hist_browser__show_callchain_node(struct hist_browser *self,
                 }
  
                 s = callchain_list__sym_name(chain, ipstr, sizeof(ipstr));
-               SLsmg_gotorc(self->b.y + row, self->b.x);
-               SLsmg_set_color(color);
+               ui_browser__gotorc(&self->b, row, 0);
+               ui_browser__set_color(&self->b, color);
                 slsmg_write_nstring(" ", offset);
                 slsmg_printf("%c ", folded_sign);
                 slsmg_write_nstring(s, width - 2);
@@ -465,7 +540,7 @@ static int hist_browser__show_entry(struct hist_browser *self,
         }
  
         if (symbol_conf.use_callchain) {
-               entry->ms.has_children = !RB_EMPTY_ROOT(&entry->sorted_chain);
+               hist_entry__init_have_children(entry);
                 folded_sign = hist_entry__folded(entry);
         }
  
@@ -484,8 +559,8 @@ static int hist_browser__show_entry(struct hist_browser *self,
                                 color = HE_COLORSET_NORMAL;
                 }
  
-               SLsmg_set_color(color);
-               SLsmg_gotorc(self->b.y + row, self->b.x);
+               ui_browser__set_color(&self->b, color);
+               ui_browser__gotorc(&self->b, row, 0);
                 if (symbol_conf.use_callchain) {
                         slsmg_printf("%c ", folded_sign);
                         width -= 2;
@@ -687,8 +762,6 @@ static struct hist_browser *hist_browser__new(struct hists *hists)
  
  static void hist_browser__delete(struct hist_browser *self)
  {
-       newtFormDestroy(self->b.form);
-       newtPopWindow();
         free(self);
  }
  
@@ -702,21 +775,26 @@ static struct thread *hist_browser__selected_thread(struct hist_browser *self)
         return self->he_selection->thread;
  }
  
-static int hist_browser__title(char *bf, size_t size, const char *ev_name,
-                              const struct dso *dso, const struct thread *thread)
+static int hists__browser_title(struct hists *self, char *bf, size_t size,
+                               const char *ev_name, const struct dso *dso,
+                               const struct thread *thread)
  {
-       int printed = 0;
+       char unit;
+       int printed;
+       unsigned long nr_events = self->stats.nr_events[PERF_RECORD_SAMPLE];
+
+       nr_events = convert_unit(nr_events, &unit);
+       printed = snprintf(bf, size, "Events: %lu%c %s", nr_events, unit, ev_name);
  
         if (thread)
                 printed += snprintf(bf + printed, size - printed,
-                                   "Thread: %s(%d)",
-                                   (thread->comm_set ?  thread->comm : ""),
+                                   ", Thread: %s(%d)",
+                                   (thread->comm_set ? thread->comm : ""),
                                     thread->pid);
         if (dso)
                 printed += snprintf(bf + printed, size - printed,
-                                   "%sDSO: %s", thread ? " " : "",
-                                   dso->short_name);
-       return printed ?: snprintf(bf, size, "Event: %s", ev_name);
+                                   ", DSO: %s", dso->short_name);
+       return printed;
  }
  
  int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
@@ -725,7 +803,6 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
         struct pstack *fstack;
         const struct thread *thread_filter = NULL;
         const struct dso *dso_filter = NULL;
-       struct newtExitStruct es;
         char msg[160];
         int key = -1;
  
@@ -738,9 +815,8 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
  
         ui_helpline__push(helpline);
  
-       hist_browser__title(msg, sizeof(msg), ev_name,
-                           dso_filter, thread_filter);
-
+       hists__browser_title(self, msg, sizeof(msg), ev_name,
+                            dso_filter, thread_filter);
         while (1) {
                 const struct thread *thread;
                 const struct dso *dso;
@@ -749,70 +825,63 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
                     annotate = -2, zoom_dso = -2, zoom_thread = -2,
                     browse_map = -2;
  
-               if (hist_browser__run(browser, msg, &es))
-                       break;
+               key = hist_browser__run(browser, msg);
  
                 thread = hist_browser__selected_thread(browser);
                 dso = browser->selection->map ? browser->selection->map->dso : NULL;
  
-               if (es.reason == NEWT_EXIT_HOTKEY) {
-                       key = es.u.key;
-
-                       switch (key) {
-                       case NEWT_KEY_F1:
-                               goto do_help;
-                       case NEWT_KEY_TAB:
-                       case NEWT_KEY_UNTAB:
-                               /*
-                                * Exit the browser, let hists__browser_tree
-                                * go to the next or previous
-                                */
-                               goto out_free_stack;
-                       default:;
-                       }
-
-                       switch (key) {
-                       case 'a':
-                               if (browser->selection->map == NULL &&
-                                   browser->selection->map->dso->annotate_warned)
-                                       continue;
-                               goto do_annotate;
-                       case 'd':
-                               goto zoom_dso;
-                       case 't':
-                               goto zoom_thread;
-                       case 'h':
-                       case '?':
-do_help:
-                               ui__help_window("->        Zoom into DSO/Threads & Annotate current symbol\n"
-                                               "<-        Zoom out\n"
-                                               "a         Annotate current symbol\n"
-                                               "h/?/F1    Show this window\n"
-                                               "d         Zoom into current DSO\n"
-                                               "t         Zoom into current Thread\n"
-                                               "q/CTRL+C  Exit browser");
+               switch (key) {
+               case NEWT_KEY_TAB:
+               case NEWT_KEY_UNTAB:
+                       /*
+                        * Exit the browser, let hists__tui_browse_tree
+                        * go to the next or previous
+                        */
+                       goto out_free_stack;
+               case 'a':
+                       if (browser->selection->map == NULL &&
+                           browser->selection->map->dso->annotate_warned)
                                 continue;
-                       default:;
-                       }
-                       if (is_exit_key(key)) {
-                               if (key == NEWT_KEY_ESCAPE &&
-                                   !ui__dialog_yesno("Do you really want to exit?"))
-                                       continue;
-                               break;
-                       }
-
-                       if (es.u.key == NEWT_KEY_LEFT) {
-                               const void *top;
+                       goto do_annotate;
+               case 'd':
+                       goto zoom_dso;
+               case 't':
+                       goto zoom_thread;
+               case NEWT_KEY_F1:
+               case 'h':
+               case '?':
+                       ui__help_window("->        Zoom into DSO/Threads & Annotate current symbol\n"
+                                       "<-        Zoom out\n"
+                                       "a         Annotate current symbol\n"
+                                       "h/?/F1    Show this window\n"
+                                       "C         Collapse all callchains\n"
+                                       "E         Expand all callchains\n"
+                                       "d         Zoom into current DSO\n"
+                                       "t         Zoom into current Thread\n"
+                                       "q/CTRL+C  Exit browser");
+                       continue;
+               case NEWT_KEY_ENTER:
+               case NEWT_KEY_RIGHT:
+                       /* menu */
+                       break;
+               case NEWT_KEY_LEFT: {
+                       const void *top;
  
-                               if (pstack__empty(fstack))
-                                       continue;
-                               top = pstack__pop(fstack);
-                               if (top == &dso_filter)
-                                       goto zoom_out_dso;
-                               if (top == &thread_filter)
-                                       goto zoom_out_thread;
+                       if (pstack__empty(fstack))
                                 continue;
-                       }
+                       top = pstack__pop(fstack);
+                       if (top == &dso_filter)
+                               goto zoom_out_dso;
+                       if (top == &thread_filter)
+                               goto zoom_out_thread;
+                       continue;
+               }
+               case NEWT_KEY_ESCAPE:
+                       if (!ui__dialog_yesno("Do you really want to exit?"))
+                               continue;
+                       /* Fall thru */
+               default:
+                       goto out_free_stack;
                 }
  
                 if (browser->selection->sym != NULL &&
@@ -885,8 +954,8 @@ zoom_out_dso:
                                 pstack__push(fstack, &dso_filter);
                         }
                         hists__filter_by_dso(self, dso_filter);
-                       hist_browser__title(msg, sizeof(msg), ev_name,
-                                           dso_filter, thread_filter);
+                       hists__browser_title(self, msg, sizeof(msg), ev_name,
+                                            dso_filter, thread_filter);
                         hist_browser__reset(browser);
                 } else if (choice == zoom_thread) {
  zoom_thread:
@@ -903,8 +972,8 @@ zoom_out_thread:
                                 pstack__push(fstack, &thread_filter);
                         }
                         hists__filter_by_thread(self, thread_filter);
-                       hist_browser__title(msg, sizeof(msg), ev_name,
-                                           dso_filter, thread_filter);
+                       hists__browser_title(self, msg, sizeof(msg), ev_name,
+                                            dso_filter, thread_filter);
                         hist_browser__reset(browser);
                 }
         }
@@ -925,10 +994,6 @@ int hists__tui_browse_tree(struct rb_root *self, const char *help)
                 const char *ev_name = __event_name(hists->type, hists->config);
  
                 key = hists__browse(hists, help, ev_name);
-
-               if (is_exit_key(key))
-                       break;
-
                 switch (key) {
                 case NEWT_KEY_TAB:
                         next = rb_next(nd);
@@ -940,7 +1005,7 @@ int hists__tui_browse_tree(struct rb_root *self, const char *help)
                                 continue;
                         nd = rb_prev(nd);
                 default:
-                       break;
+                       return key;
                 }
         }
  
diff --git a/tools/perf/util/ui/browsers/map.c b/tools/perf/util/ui/browsers/map.c

index 142b825b42bf41d90ee013f51aaac5e79df44f5e..e35437dfa5b48aea8a0fb0237b2bf7ed7aaf90cd 100644 (file)
--- a/tools/perf/util/ui/browsers/map.c
+++ b/tools/perf/util/ui/browsers/map.c
@@ -1,6 +1,5 @@
  #include "../libslang.h"
  #include <elf.h>
-#include <newt.h>
  #include <sys/ttydefaults.h>
  #include <ctype.h>
  #include <string.h>
@@ -47,7 +46,6 @@ out_free_form:
  struct map_browser {
         struct ui_browser b;
         struct map        *map;
-       u16               namelen;
         u8                addrlen;
  };
  
@@ -56,14 +54,16 @@ static void map_browser__write(struct ui_browser *self, void *nd, int row)
         struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
         struct map_browser *mb = container_of(self, struct map_browser, b);
         bool current_entry = ui_browser__is_current_entry(self, row);
-       int color = ui_browser__percent_color(0, current_entry);
+       int width;
  
-       SLsmg_set_color(color);
+       ui_browser__set_percent_color(self, 0, current_entry);
         slsmg_printf("%*llx %*llx %c ",
                      mb->addrlen, sym->start, mb->addrlen, sym->end,
                      sym->binding == STB_GLOBAL ? 'g' :
                      sym->binding == STB_LOCAL  ? 'l' : 'w');
-       slsmg_write_nstring(sym->name, mb->namelen);
+       width = self->width - ((mb->addrlen * 2) + 4);
+       if (width > 0)
+               slsmg_write_nstring(sym->name, width);
  }
  
  /* FIXME uber-kludgy, see comment on cmd_report... */
@@ -98,31 +98,29 @@ static int map_browser__search(struct map_browser *self)
         return 0;
  }
  
-static int map_browser__run(struct map_browser *self, struct newtExitStruct *es)
+static int map_browser__run(struct map_browser *self)
  {
+       int key;
+
         if (ui_browser__show(&self->b, self->map->dso->long_name,
                              "Press <- or ESC to exit, %s / to search",
                              verbose ? "" : "restart with -v to use") < 0)
                 return -1;
  
-       newtFormAddHotKey(self->b.form, NEWT_KEY_LEFT);
-       newtFormAddHotKey(self->b.form, NEWT_KEY_ENTER);
         if (verbose)
-               newtFormAddHotKey(self->b.form, '/');
+               ui_browser__add_exit_key(&self->b, '/');
  
         while (1) {
-               ui_browser__run(&self->b, es);
+               key = ui_browser__run(&self->b);
  
-               if (es->reason != NEWT_EXIT_HOTKEY)
-                       break;
-               if (verbose && es->u.key == '/')
+               if (verbose && key == '/')
                         map_browser__search(self);
                 else
                         break;
         }
  
         ui_browser__hide(&self->b);
-       return 0;
+       return key;
  }
  
  int map__browse(struct map *self)
@@ -136,7 +134,6 @@ int map__browse(struct map *self)
                 },
                 .map = self,
         };
-       struct newtExitStruct es;
         struct rb_node *nd;
         char tmp[BITS_PER_LONG / 4];
         u64 maxaddr = 0;
@@ -144,8 +141,6 @@ int map__browse(struct map *self)
         for (nd = rb_first(mb.b.entries); nd; nd = rb_next(nd)) {
                 struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
  
-               if (mb.namelen < pos->namelen)
-                       mb.namelen = pos->namelen;
                 if (maxaddr < pos->end)
                         maxaddr = pos->end;
                 if (verbose) {
@@ -156,6 +151,5 @@ int map__browse(struct map *self)
         }
  
         mb.addrlen = snprintf(tmp, sizeof(tmp), "%llx", maxaddr);
-       mb.b.width += mb.addrlen * 2 + 4 + mb.namelen;
-       return map_browser__run(&mb, &es);
+       return map_browser__run(&mb);
  }
diff --git a/tools/perf/util/ui/util.c b/tools/perf/util/ui/util.c

index 04600e26ceea21d08b701570494c5e72210aaa9e..9706d9d40279859321412b270c8ac3053141f4f9 100644 (file)
--- a/tools/perf/util/ui/util.c
+++ b/tools/perf/util/ui/util.c
@@ -11,8 +11,6 @@
  #include "helpline.h"
  #include "util.h"
  
-newtComponent newt_form__new(void);
-
  static void newt_form__set_exit_keys(newtComponent self)
  {
         newtFormAddHotKey(self, NEWT_KEY_LEFT);
@@ -22,7 +20,7 @@ static void newt_form__set_exit_keys(newtComponent self)
         newtFormAddHotKey(self, CTRL('c'));
  }
  
-newtComponent newt_form__new(void)
+static newtComponent newt_form__new(void)
  {
         newtComponent self = newtForm(NULL, NULL, 0);
         if (self)
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h

index f380fed74359034a843756256d6e7a79b0ff22b5..7562707ddd1c491755dc8ea5121637918ba1b844 100644 (file)
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -266,19 +266,6 @@ bool strglobmatch(const char *str, const char *pat);
  bool strlazymatch(const char *str, const char *pat);
  unsigned long convert_unit(unsigned long value, char *unit);
  
-#ifndef ESC
-#define ESC 27
-#endif
-
-static inline bool is_exit_key(int key)
-{
-       char up;
-       if (key == CTRL('c') || key == ESC)
-               return true;
-       up = toupper(key);
-       return up == 'Q';
-}
-
  #define _STR(x) #x
  #define STR(x) _STR(x)
author	Ingo Molnar <mingo@elte.hu>
	Thu, 23 Sep 2010 06:02:09 +0000 (08:02 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Thu, 23 Sep 2010 06:02:09 +0000 (08:02 +0200)
arch/alpha/kernel/perf_event.c		patch \| blob \| history
arch/arm/kernel/perf_event.c		patch \| blob \| history
arch/arm/oprofile/common.c		patch \| blob \| history
arch/powerpc/kernel/perf_callchain.c		patch \| blob \| history
arch/powerpc/kernel/perf_event.c		patch \| blob \| history
arch/powerpc/kernel/perf_event_fsl_emb.c		patch \| blob \| history
arch/sh/kernel/perf_callchain.c		patch \| blob \| history
arch/sh/kernel/perf_event.c		patch \| blob \| history
arch/sparc/kernel/perf_event.c		patch \| blob \| history
arch/x86/include/asm/perf_event_p4.h		patch \| blob \| history
arch/x86/kernel/cpu/perf_event.c		patch \| blob \| history
arch/x86/kernel/cpu/perf_event_intel.c		patch \| blob \| history
arch/x86/kernel/cpu/perf_event_intel_ds.c		patch \| blob \| history
arch/x86/kernel/cpu/perf_event_p4.c		patch \| blob \| history
arch/x86/kernel/kprobes.c		patch \| blob \| history
include/asm-generic/hardirq.h		patch \| blob \| history
include/linux/ftrace_event.h		patch \| blob \| history
include/linux/interrupt.h		patch \| blob \| history
include/linux/percpu.h		patch \| blob \| history
include/linux/perf_event.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
include/trace/events/irq.h		patch \| blob \| history
include/trace/events/napi.h		patch \| blob \| history
include/trace/events/net.h	[new file with mode: 0644]	patch \| blob
include/trace/events/power.h		patch \| blob \| history
include/trace/events/skb.h		patch \| blob \| history
kernel/exit.c		patch \| blob \| history
kernel/hw_breakpoint.c		patch \| blob \| history
kernel/kprobes.c		patch \| blob \| history
kernel/perf_event.c		patch \| blob \| history
kernel/sched.c		patch \| blob \| history
kernel/trace/ftrace.c		patch \| blob \| history
kernel/trace/ring_buffer.c		patch \| blob \| history
kernel/trace/trace_event_perf.c		patch \| blob \| history
kernel/trace/trace_events.c		patch \| blob \| history
kernel/trace/trace_functions_graph.c		patch \| blob \| history
kernel/watchdog.c		patch \| blob \| history
lib/Kconfig.debug		patch \| blob \| history
net/core/datagram.c		patch \| blob \| history
net/core/dev.c		patch \| blob \| history
net/core/net-traces.c		patch \| blob \| history
net/core/skbuff.c		patch \| blob \| history
tools/perf/Documentation/perf-annotate.txt		patch \| blob \| history
tools/perf/Documentation/perf-report.txt		patch \| blob \| history
tools/perf/Makefile		patch \| blob \| history
tools/perf/builtin-annotate.c		patch \| blob \| history
tools/perf/builtin-report.c		patch \| blob \| history
tools/perf/feature-tests.mak		patch \| blob \| history
tools/perf/scripts/python/bin/netdev-times-record	[new file with mode: 0644]	patch \| blob
tools/perf/scripts/python/bin/netdev-times-report	[new file with mode: 0644]	patch \| blob
tools/perf/scripts/python/netdev-times.py	[new file with mode: 0644]	patch \| blob
tools/perf/util/cache.h		patch \| blob \| history
tools/perf/util/callchain.c		patch \| blob \| history
tools/perf/util/callchain.h		patch \| blob \| history
tools/perf/util/hist.c		patch \| blob \| history
tools/perf/util/path.c		patch \| blob \| history
tools/perf/util/sort.h		patch \| blob \| history
tools/perf/util/symbol.c		patch \| blob \| history
tools/perf/util/symbol.h		patch \| blob \| history
tools/perf/util/ui/browser.c		patch \| blob \| history
tools/perf/util/ui/browser.h		patch \| blob \| history
tools/perf/util/ui/browsers/annotate.c		patch \| blob \| history
tools/perf/util/ui/browsers/hists.c		patch \| blob \| history
tools/perf/util/ui/browsers/map.c		patch \| blob \| history
tools/perf/util/ui/util.c		patch \| blob \| history
tools/perf/util/util.h		patch \| blob \| history