perf: Store active software events in a hashlist
Frederic Weisbecker [Mon, 5 Apr 2010 13:35:57 +0000 (15:35 +0200)]
Each time a software event triggers, we need to walk through
the entire list of events from the current cpu and task contexts
to retrieve a running perf event that matches.
We also need to check a matching perf event is actually counting.

This walk is wasteful and makes the event fast path scaling
down with a growing number of events running on the same
contexts.

To solve this, we store the running perf events in a hashlist to
get an immediate access to them against their type:event_id when
they trigger.

v2: - Fix SWEVENT_HLIST_SIZE definition (and re-learn some basic
      maths along the way)
    - Only allocate hlist for online cpus, but keep track of the
      refcount on offline possible cpus too, so that we allocate it
      if needed when it becomes online.
    - Drop the kref use as it's not adapted to our tricks anymore.

v3: - Fix bad refcount check (address instead of value). Thanks to
      Eric Dumazet who spotted this.
    - While exiting cpu, move the hlist release out of the IPI path
      to lock the hlist mutex sanely.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>

include/linux/perf_event.h
kernel/perf_event.c

index 6e96cc8..bf896d0 100644 (file)
@@ -589,6 +589,14 @@ enum perf_group_flag {
        PERF_GROUP_SOFTWARE = 0x1,
 };
 
+#define SWEVENT_HLIST_BITS     8
+#define SWEVENT_HLIST_SIZE     (1 << SWEVENT_HLIST_BITS)
+
+struct swevent_hlist {
+       struct hlist_head       heads[SWEVENT_HLIST_SIZE];
+       struct rcu_head         rcu_head;
+};
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -597,6 +605,7 @@ struct perf_event {
        struct list_head                group_entry;
        struct list_head                event_entry;
        struct list_head                sibling_list;
+       struct hlist_node               hlist_entry;
        int                             nr_siblings;
        int                             group_flags;
        struct perf_event               *group_leader;
@@ -744,6 +753,9 @@ struct perf_cpu_context {
        int                             active_oncpu;
        int                             max_pertask;
        int                             exclusive;
+       struct swevent_hlist            *swevent_hlist;
+       struct mutex                    hlist_mutex;
+       int                             hlist_refcount;
 
        /*
         * Recursion avoidance:
index fcf42dc..9efdfe5 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 #include <linux/sysfs.h>
 #include <linux/dcache.h>
 #include <linux/percpu.h>
@@ -3966,36 +3967,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
        perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
-static int perf_swevent_is_counting(struct perf_event *event)
-{
-       /*
-        * The event is active, we're good!
-        */
-       if (event->state == PERF_EVENT_STATE_ACTIVE)
-               return 1;
-
-       /*
-        * The event is off/error, not counting.
-        */
-       if (event->state != PERF_EVENT_STATE_INACTIVE)
-               return 0;
-
-       /*
-        * The event is inactive, if the context is active
-        * we're part of a group that didn't make it on the 'pmu',
-        * not counting.
-        */
-       if (event->ctx->is_active)
-               return 0;
-
-       /*
-        * We're inactive and the context is too, this means the
-        * task is scheduled out, we're counting events that happen
-        * to us, like migration events.
-        */
-       return 1;
-}
-
 static int perf_tp_event_match(struct perf_event *event,
                                struct perf_sample_data *data);
 
@@ -4019,12 +3990,6 @@ static int perf_swevent_match(struct perf_event *event,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
 {
-       if (event->cpu != -1 && event->cpu != smp_processor_id())
-               return 0;
-
-       if (!perf_swevent_is_counting(event))
-               return 0;
-
        if (event->attr.type != type)
                return 0;
 
@@ -4041,18 +4006,53 @@ static int perf_swevent_match(struct perf_event *event,
        return 1;
 }
 
-static void perf_swevent_ctx_event(struct perf_event_context *ctx,
-                                    enum perf_type_id type,
-                                    u32 event_id, u64 nr, int nmi,
-                                    struct perf_sample_data *data,
-                                    struct pt_regs *regs)
+static inline u64 swevent_hash(u64 type, u32 event_id)
+{
+       u64 val = event_id | (type << 32);
+
+       return hash_64(val, SWEVENT_HLIST_BITS);
+}
+
+static struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+{
+       u64 hash;
+       struct swevent_hlist *hlist;
+
+       hash = swevent_hash(type, event_id);
+
+       hlist = rcu_dereference(ctx->swevent_hlist);
+       if (!hlist)
+               return NULL;
+
+       return &hlist->heads[hash];
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+                                   u64 nr, int nmi,
+                                   struct perf_sample_data *data,
+                                   struct pt_regs *regs)
 {
+       struct perf_cpu_context *cpuctx;
        struct perf_event *event;
+       struct hlist_node *node;
+       struct hlist_head *head;
 
-       list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+       cpuctx = &__get_cpu_var(perf_cpu_context);
+
+       rcu_read_lock();
+
+       head = find_swevent_head(cpuctx, type, event_id);
+
+       if (!head)
+               goto end;
+
+       hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
                if (perf_swevent_match(event, type, event_id, data, regs))
                        perf_swevent_add(event, nr, nmi, data, regs);
        }
+end:
+       rcu_read_unlock();
 }
 
 int perf_swevent_get_recursion_context(void)
@@ -4090,27 +4090,6 @@ void perf_swevent_put_recursion_context(int rctx)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
-                                   u64 nr, int nmi,
-                                   struct perf_sample_data *data,
-                                   struct pt_regs *regs)
-{
-       struct perf_cpu_context *cpuctx;
-       struct perf_event_context *ctx;
-
-       cpuctx = &__get_cpu_var(perf_cpu_context);
-       rcu_read_lock();
-       perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
-                                nr, nmi, data, regs);
-       /*
-        * doesn't really matter which of the child contexts the
-        * events ends up in.
-        */
-       ctx = rcu_dereference(current->perf_event_ctxp);
-       if (ctx)
-               perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
-       rcu_read_unlock();
-}
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
                            struct pt_regs *regs, u64 addr)
@@ -4136,16 +4115,28 @@ static void perf_swevent_read(struct perf_event *event)
 static int perf_swevent_enable(struct perf_event *event)
 {
        struct hw_perf_event *hwc = &event->hw;
+       struct perf_cpu_context *cpuctx;
+       struct hlist_head *head;
+
+       cpuctx = &__get_cpu_var(perf_cpu_context);
 
        if (hwc->sample_period) {
                hwc->last_period = hwc->sample_period;
                perf_swevent_set_period(event);
        }
+
+       head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+       if (WARN_ON_ONCE(!head))
+               return -EINVAL;
+
+       hlist_add_head_rcu(&event->hlist_entry, head);
+
        return 0;
 }
 
 static void perf_swevent_disable(struct perf_event *event)
 {
+       hlist_del_rcu(&event->hlist_entry);
 }
 
 static const struct pmu perf_ops_generic = {
@@ -4359,13 +4350,115 @@ static int perf_tp_event_match(struct perf_event *event,
        return 0;
 }
 
+static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
+{
+       struct swevent_hlist *hlist;
+
+       hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
+       kfree(hlist);
+}
+
+static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
+{
+       struct swevent_hlist *hlist;
+
+       if (!cpuctx->swevent_hlist)
+               return;
+
+       hlist = cpuctx->swevent_hlist;
+       rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
+       call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
+}
+
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+
+       mutex_lock(&cpuctx->hlist_mutex);
+
+       if (!--cpuctx->hlist_refcount)
+               swevent_hlist_release(cpuctx);
+
+       mutex_unlock(&cpuctx->hlist_mutex);
+}
+
+static void swevent_hlist_put(struct perf_event *event)
+{
+       int cpu;
+
+       if (event->cpu != -1) {
+               swevent_hlist_put_cpu(event, event->cpu);
+               return;
+       }
+
+       for_each_possible_cpu(cpu)
+               swevent_hlist_put_cpu(event, cpu);
+}
+
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       int err = 0;
+
+       mutex_lock(&cpuctx->hlist_mutex);
+
+       if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+               struct swevent_hlist *hlist;
+
+               hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+               if (!hlist) {
+                       err = -ENOMEM;
+                       goto exit;
+               }
+               rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+       }
+       cpuctx->hlist_refcount++;
+ exit:
+       mutex_unlock(&cpuctx->hlist_mutex);
+
+       return err;
+}
+
+static int swevent_hlist_get(struct perf_event *event)
+{
+       int err;
+       int cpu, failed_cpu;
+
+       if (event->cpu != -1)
+               return swevent_hlist_get_cpu(event, event->cpu);
+
+       get_online_cpus();
+       for_each_possible_cpu(cpu) {
+               err = swevent_hlist_get_cpu(event, cpu);
+               if (err) {
+                       failed_cpu = cpu;
+                       goto fail;
+               }
+       }
+       put_online_cpus();
+
+       return 0;
+ fail:
+       for_each_possible_cpu(cpu) {
+               if (cpu == failed_cpu)
+                       break;
+               swevent_hlist_put_cpu(event, cpu);
+       }
+
+       put_online_cpus();
+       return err;
+}
+
 static void tp_perf_event_destroy(struct perf_event *event)
 {
        perf_trace_disable(event->attr.config);
+       swevent_hlist_put(event);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
+       int err;
+
        /*
         * Raw tracepoint data is a severe data leak, only allow root to
         * have these.
@@ -4379,6 +4472,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
                return NULL;
 
        event->destroy = tp_perf_event_destroy;
+       err = swevent_hlist_get(event);
+       if (err) {
+               perf_trace_disable(event->attr.config);
+               return ERR_PTR(err);
+       }
 
        return &perf_ops_generic;
 }
@@ -4479,6 +4577,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
        WARN_ON(event->parent);
 
        atomic_dec(&perf_swevent_enabled[event_id]);
+       swevent_hlist_put(event);
 }
 
 static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4517,6 +4616,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
        case PERF_COUNT_SW_ALIGNMENT_FAULTS:
        case PERF_COUNT_SW_EMULATION_FAULTS:
                if (!event->parent) {
+                       int err;
+
+                       err = swevent_hlist_get(event);
+                       if (err)
+                               return ERR_PTR(err);
+
                        atomic_inc(&perf_swevent_enabled[event_id]);
                        event->destroy = sw_perf_event_destroy;
                }
@@ -5389,6 +5494,7 @@ static void __init perf_event_init_all_cpus(void)
 
        for_each_possible_cpu(cpu) {
                cpuctx = &per_cpu(perf_cpu_context, cpu);
+               mutex_init(&cpuctx->hlist_mutex);
                __perf_event_init_context(&cpuctx->ctx, NULL);
        }
 }
@@ -5402,6 +5508,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
        spin_lock(&perf_resource_lock);
        cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
        spin_unlock(&perf_resource_lock);
+
+       mutex_lock(&cpuctx->hlist_mutex);
+       if (cpuctx->hlist_refcount > 0) {
+               struct swevent_hlist *hlist;
+
+               hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+               WARN_ON_ONCE(!hlist);
+               rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+       }
+       mutex_unlock(&cpuctx->hlist_mutex);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5421,6 +5537,10 @@ static void perf_event_exit_cpu(int cpu)
        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
        struct perf_event_context *ctx = &cpuctx->ctx;
 
+       mutex_lock(&cpuctx->hlist_mutex);
+       swevent_hlist_release(cpuctx);
+       mutex_unlock(&cpuctx->hlist_mutex);
+
        mutex_lock(&ctx->mutex);
        smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
        mutex_unlock(&ctx->mutex);