performance counters: core code
Thomas Gleixner [Thu, 4 Dec 2008 19:12:29 +0000 (20:12 +0100)]
Implement the core kernel bits of the Performance Counters subsystem.

The Linux Performance Counter subsystem provides an abstraction of
performance counter hardware capabilities. It provides per task and per
CPU counters, and it provides event capabilities on top of those.

Performance counters are accessed via special file descriptors.
There's one file descriptor per virtual counter used.

The special file descriptor is opened via the perf_counter_open()
system call:

 int
 perf_counter_open(u32 hw_event_type,
                   u32 hw_event_period,
                   u32 record_type,
                   pid_t pid,
                   int cpu);

The syscall returns the new fd. The fd can be used via the normal
VFS system calls: read() can be used to read the counter, fcntl()
can be used to set the blocking mode, etc.

Multiple counters can be kept open at a time, and the counters
can be poll()ed.

See more details in Documentation/perf-counters.txt.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

drivers/char/sysrq.c
include/linux/perf_counter.h [new file with mode: 0644]
include/linux/sched.h
include/linux/syscalls.h
init/Kconfig
kernel/Makefile
kernel/fork.c
kernel/perf_counter.c [new file with mode: 0644]
kernel/sched.c
kernel/sys_ni.c

index ce0d9da..52146c2 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/kbd_kern.h>
 #include <linux/proc_fs.h>
 #include <linux/quotaops.h>
+#include <linux/perf_counter.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
@@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
        struct pt_regs *regs = get_irq_regs();
        if (regs)
                show_regs(regs);
+       perf_counter_print_debug();
 }
 static struct sysrq_key_op sysrq_showregs_op = {
        .handler        = sysrq_handle_showregs,
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644 (file)
index 0000000..22c4469
--- /dev/null
@@ -0,0 +1,171 @@
+/*
+ *  Performance counters:
+ *
+ *   Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
+ *   Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
+ *
+ *  Data type definitions, declarations, prototypes.
+ *
+ *  Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#ifndef _LINUX_PERF_COUNTER_H
+#define _LINUX_PERF_COUNTER_H
+
+#include <asm/atomic.h>
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+
+struct task_struct;
+
+/*
+ * Generalized hardware event types, used by the hw_event_type parameter
+ * of the sys_perf_counter_open() syscall.
+ *
+ * The first six enumerators take consecutive values starting at 0 and
+ * each names one kind of hardware event. PERF_COUNT_NMI is a separate
+ * flag bit (bit 30) rather than an event of its own.
+ */
+enum hw_event_types {
+       PERF_COUNT_CYCLES,
+       PERF_COUNT_INSTRUCTIONS,
+       PERF_COUNT_CACHE_REFERENCES,
+       PERF_COUNT_CACHE_MISSES,
+       PERF_COUNT_BRANCH_INSTRUCTIONS,
+       PERF_COUNT_BRANCH_MISSES,
+       /*
+        * If this bit is set in the type, then trigger NMI sampling:
+        */
+       PERF_COUNT_NMI                  = (1 << 30),
+};
+
+/*
+ * IRQ-notification data record type.
+ *
+ * Stored in perf_counter::record_type and used by perf_read() to pick
+ * the read path: PERF_RECORD_SIMPLE reads return the 64-bit counter
+ * value, PERF_RECORD_IRQ and PERF_RECORD_GROUP reads return the data
+ * buffered in the counter's perf_data buffers.
+ */
+enum perf_record_type {
+       PERF_RECORD_SIMPLE,
+       PERF_RECORD_IRQ,
+       PERF_RECORD_GROUP,
+};
+
+/**
+ * struct hw_perf_counter - performance counter hardware details
+ *
+ * Embedded in struct perf_counter as the 'hw' member.
+ *
+ * NOTE(review): none of these fields is read or written by the core code
+ * visible next to this header; presumably they are owned by the
+ * architecture implementation of the hw_perf_counter_*() hooks. Confirm
+ * the meaning of each field against the architecture code before relying
+ * on it.
+ */
+struct hw_perf_counter {
+       u64                     config;
+       unsigned long           config_base;
+       unsigned long           counter_base;
+       int                     nmi;
+       unsigned int            idx;
+       u64                     prev_count;
+       s32                     next_count;
+       u64                     irq_period;
+};
+
+/*
+ * Hardcoded buffer length limit for now, for IRQ-fed events:
+ */
+#define PERF_DATA_BUFLEN       2048
+
+/**
+ * struct perf_data - performance counter IRQ data sampling ...
+ * @len:       number of unread bytes currently held in @data
+ * @rd_idx:    offset into @data of the next byte to hand to a reader;
+ *             reset to 0 once the buffer has been fully drained
+ * @overrun:   NOTE(review): presumably counts dropped samples - it is not
+ *             touched by the core read path, confirm against the
+ *             architecture code
+ * @data:      sample storage, PERF_DATA_BUFLEN bytes
+ */
+struct perf_data {
+       int                     len;
+       int                     rd_idx;
+       int                     overrun;
+       u8                      data[PERF_DATA_BUFLEN];
+};
+
+/**
+ * struct perf_counter - performance counter kernel representation:
+ * @list:              entry in the owning context's 'counters' list
+ * @active:            non-zero while the counter is enabled in hardware
+ * @count:             cached counter value (64-bit kernels)
+ * @count32:           cached counter value on 32-bit kernels, split into
+ *                     the low ([0]) and high ([1]) 32-bit halves
+ * @__irq_period:      the hw_event_period value passed at open time
+ * @hw:                hardware details, see struct hw_perf_counter
+ * @ctx:               context the counter is installed in
+ * @task:              task the counter is attached to; NULL once removed
+ * @mutex:             protects attach/detach and serializes readers
+ * @oncpu:             CPU the counter was last enabled on; set to -1 when
+ *                     the owning task is scheduled out
+ * @cpu:               CPU requested at open time, -1 meaning any CPU
+ * @hw_event_type:     NOTE(review): presumably the requested event type;
+ *                     it is not assigned by the core code visible here
+ * @record_type:       selects how read() returns data
+ * @waitq:             readers and pollers wait here for IRQ data
+ * @wakeup_pending:    cleared at allocation; the comment below says it is
+ *                     only used for NMIs
+ * @irqdata:           buffer currently designated for IRQ-side data;
+ *                     initially &data[0]
+ * @usrdata:           buffer currently drained by readers; initially
+ *                     &data[1]. The two pointers are swapped when the
+ *                     reader has emptied its buffer.
+ * @data:              backing storage for @irqdata and @usrdata
+ */
+struct perf_counter {
+       struct list_head                list;
+       int                             active;
+#if BITS_PER_LONG == 64
+       atomic64_t                      count;
+#else
+       atomic_t                        count32[2];
+#endif
+       u64                             __irq_period;
+
+       struct hw_perf_counter          hw;
+
+       struct perf_counter_context     *ctx;
+       struct task_struct              *task;
+
+       /*
+        * Protect attach/detach:
+        */
+       struct mutex                    mutex;
+
+       int                             oncpu;
+       int                             cpu;
+
+       s32                             hw_event_type;
+       enum perf_record_type           record_type;
+
+       /* read() / irq related data */
+       wait_queue_head_t               waitq;
+       /* optional: for NMIs */
+       int                             wakeup_pending;
+       struct perf_data                *irqdata;
+       struct perf_data                *usrdata;
+       struct perf_data                data[2];
+};
+
+/**
+ * struct perf_counter_context - counter context structure
+ * @lock:              protects @counters and the counts below
+ * @counters:          list of the perf_counter structures in this context
+ * @nr_counters:       number of entries on @counters
+ * @nr_active:         number of those counters currently enabled
+ * @task:              owning task for a task context, NULL for a per-CPU
+ *                     context
+ *
+ * Used as a container for task counters and CPU counters as well.
+ *
+ * The structure is embedded in task_struct unconditionally; all members
+ * are compiled out when CONFIG_PERF_COUNTERS is not set.
+ */
+struct perf_counter_context {
+#ifdef CONFIG_PERF_COUNTERS
+       /*
+        * Protect the list of counters:
+        */
+       spinlock_t              lock;
+       struct list_head        counters;
+       int                     nr_counters;
+       int                     nr_active;
+       struct task_struct      *task;
+#endif
+};
+
+/**
+ * struct perf_cpu_context - per cpu counter context structure
+ * @ctx:               context holding the per-CPU counters of this CPU
+ * @task_ctx:          context of the task whose counters are currently
+ *                     scheduled in on this CPU, NULL if there is none
+ * @active_oncpu:      number of counters enabled on this CPU, counting
+ *                     both per-CPU and task counters
+ * @max_pertask:       upper bound on the number of task counters that are
+ *                     enabled when a task is scheduled in
+ */
+struct perf_cpu_context {
+       struct perf_counter_context     ctx;
+       struct perf_counter_context     *task_ctx;
+       int                             active_oncpu;
+       int                             max_pertask;
+};
+
+/*
+ * Set by architecture code:
+ */
+extern int perf_max_counters;
+
+#ifdef CONFIG_PERF_COUNTERS
+extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
+extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
+extern void perf_counter_task_tick(struct task_struct *task, int cpu);
+extern void perf_counter_init_task(struct task_struct *task);
+extern void perf_counter_notify(struct pt_regs *regs);
+extern void perf_counter_print_debug(void);
+#else /* !CONFIG_PERF_COUNTERS: empty stubs, so callers need no #ifdef */
+static inline void
+perf_counter_task_sched_in(struct task_struct *task, int cpu)          { }
+static inline void
+perf_counter_task_sched_out(struct task_struct *task, int cpu)         { }
+static inline void
+perf_counter_task_tick(struct task_struct *task, int cpu)              { }
+static inline void perf_counter_init_task(struct task_struct *task)    { }
+static inline void perf_counter_notify(struct pt_regs *regs)           { }
+static inline void perf_counter_print_debug(void)                      { }
+#endif /* CONFIG_PERF_COUNTERS */
+
+#endif /* _LINUX_PERF_COUNTER_H */
index 55e30d1..4c53027 100644 (file)
@@ -71,6 +71,7 @@ struct sched_param {
 #include <linux/fs_struct.h>
 #include <linux/compiler.h>
 #include <linux/completion.h>
+#include <linux/perf_counter.h>
 #include <linux/pid.h>
 #include <linux/percpu.h>
 #include <linux/topology.h>
@@ -1326,6 +1327,7 @@ struct task_struct {
        struct list_head pi_state_list;
        struct futex_pi_state *pi_state_cache;
 #endif
+       struct perf_counter_context perf_counter_ctx;
 #ifdef CONFIG_NUMA
        struct mempolicy *mempolicy;
        short il_next;
@@ -2285,6 +2287,13 @@ static inline void inc_syscw(struct task_struct *tsk)
 #define TASK_SIZE_OF(tsk)      TASK_SIZE
 #endif
 
+/*
+ * Call the function if the target task is executing on a CPU right now:
+ */
+extern void task_oncpu_function_call(struct task_struct *p,
+                                    void (*func) (void *info), void *info);
+
+
 #ifdef CONFIG_MM_OWNER
 extern void mm_update_next_owner(struct mm_struct *mm);
 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
index 04fb47b..6cce728 100644 (file)
@@ -624,4 +624,10 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
+asmlinkage int
+sys_perf_counter_open(u32 hw_event_type,
+                     u32 hw_event_period,
+                     u32 record_type,
+                     pid_t pid,
+                     int cpu);
 #endif
index f763762..78bede2 100644 (file)
@@ -732,6 +732,35 @@ config AIO
           by some high performance threaded applications. Disabling
           this option saves about 7k.
 
+config HAVE_PERF_COUNTERS
+       bool
+
+menu "Performance Counters"
+
+config PERF_COUNTERS
+       bool "Kernel Performance Counters"
+       depends on HAVE_PERF_COUNTERS
+       default y
+       help
+         Enable kernel support for performance counter hardware.
+
+         Performance counters are special hardware registers available
+         on most modern CPUs. These registers count the number of certain
+         types of hw events, such as instructions executed, cache misses
+         suffered, or branches mispredicted - without slowing down the
+         kernel or applications. These registers can also trigger interrupts
+         when a threshold number of events have passed - and can thus be
+         used to profile the code that runs on that CPU.
+
+         The Linux Performance Counter subsystem provides an abstraction of
+         these hardware capabilities, available via a system call. It
+         provides per task and per CPU counters, and it provides event
+         capabilities on top of those.
+
+         Say Y if unsure.
+
+endmenu
+
 config VM_EVENT_COUNTERS
        default y
        bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
index 19fad00..1f184a1 100644 (file)
@@ -89,6 +89,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
index 2a372a0..441fadf 100644 (file)
@@ -975,6 +975,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                goto fork_out;
 
        rt_mutex_init_task(p);
+       perf_counter_init_task(p);
 
 #ifdef CONFIG_PROVE_LOCKING
        DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644 (file)
index 0000000..20508f0
--- /dev/null
@@ -0,0 +1,943 @@
+/*
+ * Performance counter core code
+ *
+ *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/fs.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/poll.h>
+#include <linux/sysfs.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/anon_inodes.h>
+#include <linux/perf_counter.h>
+
+/*
+ * Each CPU has a list of per CPU counters:
+ */
+DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+int perf_max_counters __read_mostly;   /* set by architecture code */
+static int perf_reserved_percpu __read_mostly; /* subtracted from perf_max_counters when capping max_pertask */
+static int perf_overcommit __read_mostly = 1;  /* NOTE(review): not referenced in the code visible here */
+
+/*
+ * Mutex for (sysadmin-configurable) counter reservations:
+ */
+static DEFINE_MUTEX(perf_resource_mutex);
+
+/*
+ * Architecture provided APIs - weak aliases:
+ *
+ * These defaults are what gets linked when the architecture supplies no
+ * implementation of its own: hw_perf_counter_init() fails with -EINVAL,
+ * which makes sys_perf_counter_open() bail out, and all other hooks are
+ * empty.
+ */
+
+int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type)
+{
+       return -EINVAL;
+}
+
+void __weak hw_perf_counter_enable(struct perf_counter *counter)        { }
+void __weak hw_perf_counter_disable(struct perf_counter *counter)       { }
+void __weak hw_perf_counter_read(struct perf_counter *counter)          { }
+void __weak hw_perf_disable_all(void) { }
+void __weak hw_perf_enable_all(void) { }
+void __weak hw_perf_counter_setup(void) { }
+
+#if BITS_PER_LONG == 64
+
+/*
+ * Read the cached counter in counter safe against cross CPU / NMI
+ * modifications. 64 bit version - no complications: the value lives in
+ * a single atomic64_t and is fetched with one atomic read.
+ */
+static inline u64 perf_read_counter_safe(struct perf_counter *counter)
+{
+       return (u64) atomic64_read(&counter->count);
+}
+
+#else
+
+/*
+ * Read the cached counter in counter safe against cross CPU / NMI
+ * modifications. 32 bit version.
+ *
+ * The value is kept as two 32-bit halves, count32[0] (low) and
+ * count32[1] (high). The high half is sampled before and after the low
+ * half; if it changed in between, the read is retried.
+ *
+ * The interrupt state is saved and restored instead of being enabled
+ * unconditionally on exit, so the helper does not turn interrupts on
+ * behind the back of a caller that runs with them disabled.
+ */
+static u64 perf_read_counter_safe(struct perf_counter *counter)
+{
+       unsigned long flags;
+       u32 cntl, cnth;
+
+       local_irq_save(flags);
+       do {
+               cnth = atomic_read(&counter->count32[1]);
+               cntl = atomic_read(&counter->count32[0]);
+       } while (cnth != atomic_read(&counter->count32[1]));
+
+       local_irq_restore(flags);
+
+       return cntl | ((u64) cnth) << 32;
+}
+
+#endif
+
+/*
+ * Cross CPU call to remove a performance counter
+ *
+ * We disable the counter on the hardware level first. After that we
+ * remove it from the context list.
+ *
+ * @info is the struct perf_counter to remove. The function works on the
+ * perf_cpu_context of the CPU it executes on. For a task counter whose
+ * context is not the one currently scheduled in on this CPU it returns
+ * without doing anything; perf_remove_from_context() notices that the
+ * counter is still on the list and completes the removal itself.
+ *
+ * For a per-CPU context, max_pertask is recomputed afterwards because
+ * removing a per-CPU counter frees up room for task counters.
+ */
+static void __perf_remove_from_context(void *info)
+{
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_counter *counter = info;
+       struct perf_counter_context *ctx = counter->ctx;
+
+       /*
+        * If this is a task context, we need to check whether it is
+        * the current task context of this cpu. If not it has been
+        * scheduled out before the smp call arrived.
+        */
+       if (ctx->task && cpuctx->task_ctx != ctx)
+               return;
+
+       spin_lock(&ctx->lock);
+
+       if (counter->active) {
+               hw_perf_counter_disable(counter);
+               counter->active = 0;
+               ctx->nr_active--;
+               cpuctx->active_oncpu--;
+               counter->task = NULL;
+       }
+       ctx->nr_counters--;
+
+       /*
+        * Protect the list operation against NMI by disabling the
+        * counters on a global level. NOP for non NMI based counters.
+        */
+       hw_perf_disable_all();
+       list_del_init(&counter->list);
+       hw_perf_enable_all();
+
+       if (!ctx->task) {
+               /*
+                * Allow more per task counters with respect to the
+                * reservation:
+                */
+               cpuctx->max_pertask =
+                       min(perf_max_counters - ctx->nr_counters,
+                           perf_max_counters - perf_reserved_percpu);
+       }
+
+       spin_unlock(&ctx->lock);
+}
+
+
+/*
+ * Remove the counter from a task's (or a CPU's) list of counters.
+ *
+ * Must be called with counter->mutex held.
+ *
+ * CPU counters are removed with a smp call. For task counters we only
+ * call when the task is on a CPU.
+ *
+ * For task counters the cross call may find the task already scheduled
+ * out and do nothing. That case is detected afterwards under ctx->lock
+ * by the counter still being on the list: if the context has active
+ * counters the call is retried, otherwise the counter is unlinked here
+ * directly.
+ */
+static void perf_remove_from_context(struct perf_counter *counter)
+{
+       struct perf_counter_context *ctx = counter->ctx;
+       struct task_struct *task = ctx->task;
+
+       if (!task) {
+               /*
+                * Per cpu counters are removed via an smp call and
+                * the removal is always successful.
+                */
+               smp_call_function_single(counter->cpu,
+                                        __perf_remove_from_context,
+                                        counter, 1);
+               return;
+       }
+
+retry:
+       task_oncpu_function_call(task, __perf_remove_from_context,
+                                counter);
+
+       spin_lock_irq(&ctx->lock);
+       /*
+        * If the context is active we need to retry the smp call.
+        */
+       if (ctx->nr_active && !list_empty(&counter->list)) {
+               spin_unlock_irq(&ctx->lock);
+               goto retry;
+       }
+
+       /*
+        * The lock prevents that this context is scheduled in so we
+        * can remove the counter safely, if the call above did not
+        * succeed.
+        */
+       if (!list_empty(&counter->list)) {
+               ctx->nr_counters--;
+               list_del_init(&counter->list);
+               counter->task = NULL;
+       }
+       spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Cross CPU call to install and enable a performance counter
+ *
+ * @info is the struct perf_counter to install; its ->ctx must already
+ * point at the target context. The counter is appended to the context
+ * list and, if fewer than perf_max_counters counters are active on this
+ * CPU, enabled in hardware right away. Otherwise it stays on the list
+ * in the inactive state.
+ *
+ * Installing a per-CPU counter lowers max_pertask by one (not below 0),
+ * i.e. it takes a slot away from task counters.
+ */
+static void __perf_install_in_context(void *info)
+{
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_counter *counter = info;
+       struct perf_counter_context *ctx = counter->ctx;
+       int cpu = smp_processor_id();
+
+       /*
+        * If this is a task context, we need to check whether it is
+        * the current task context of this cpu. If not it has been
+        * scheduled out before the smp call arrived.
+        */
+       if (ctx->task && cpuctx->task_ctx != ctx)
+               return;
+
+       spin_lock(&ctx->lock);
+
+       /*
+        * Protect the list operation against NMI by disabling the
+        * counters on a global level. NOP for non NMI based counters.
+        */
+       hw_perf_disable_all();
+       list_add_tail(&counter->list, &ctx->counters);
+       hw_perf_enable_all();
+
+       ctx->nr_counters++;
+
+       if (cpuctx->active_oncpu < perf_max_counters) {
+               hw_perf_counter_enable(counter);
+               counter->active = 1;
+               counter->oncpu = cpu;
+               ctx->nr_active++;
+               cpuctx->active_oncpu++;
+       }
+
+       if (!ctx->task && cpuctx->max_pertask)
+               cpuctx->max_pertask--;
+
+       spin_unlock(&ctx->lock);
+}
+
+/*
+ * Attach a performance counter to a context
+ *
+ * First we add the counter to the list with the hardware enable bit
+ * in counter->hw_config cleared.
+ * NOTE(review): struct perf_counter has no hw_config member; this
+ * presumably refers to counter->hw.config - confirm.
+ *
+ * If the counter is attached to a task which is on a CPU we use a smp
+ * call to enable it in the task context. The task might have been
+ * scheduled away, but we check this in the smp call again.
+ *
+ * @ctx:       context to attach to; ctx->task decides between the
+ *             per-CPU and the task path
+ * @counter:   counter to attach; counter->ctx (and counter->task for a
+ *             task context) are set here
+ * @cpu:       CPU to run the install on; only used for per-CPU contexts
+ *
+ * For a task context, if the cross call did not add the counter (the
+ * task was not running), the counter is linked into the list here under
+ * ctx->lock without being enabled.
+ */
+static void
+perf_install_in_context(struct perf_counter_context *ctx,
+                       struct perf_counter *counter,
+                       int cpu)
+{
+       struct task_struct *task = ctx->task;
+
+       counter->ctx = ctx;
+       if (!task) {
+               /*
+                * Per cpu counters are installed via an smp call and
+                * the install is always successful.
+                */
+               smp_call_function_single(cpu, __perf_install_in_context,
+                                        counter, 1);
+               return;
+       }
+
+       counter->task = task;
+retry:
+       task_oncpu_function_call(task, __perf_install_in_context,
+                                counter);
+
+       spin_lock_irq(&ctx->lock);
+       /*
+        * If the context is active and the counter has not been added
+        * we need to retry the smp call.
+        */
+       if (ctx->nr_active && list_empty(&counter->list)) {
+               spin_unlock_irq(&ctx->lock);
+               goto retry;
+       }
+
+       /*
+        * The lock prevents that this context is scheduled in so we
+        * can add the counter safely, if the call above did not
+        * succeed.
+        */
+       if (list_empty(&counter->list)) {
+               list_add_tail(&counter->list, &ctx->counters);
+               ctx->nr_counters++;
+       }
+       spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Called from scheduler to remove the counters of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each counter and update the counter value in counter->count.
+ *
+ * This does not protect us against NMI, but hw_perf_counter_disable()
+ * sets the disabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * not restart the counter.
+ *
+ * @task:      task being scheduled out
+ * @cpu:       CPU it is leaving
+ *
+ * Returns immediately when no task context is scheduled in on @cpu.
+ * Otherwise every active counter of @task is disabled and marked as
+ * not being on any CPU (oncpu = -1), and the CPU's task_ctx is cleared.
+ * The list walk stops early once nr_active has dropped to zero.
+ */
+void perf_counter_task_sched_out(struct task_struct *task, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_counter_context *ctx = &task->perf_counter_ctx;
+       struct perf_counter *counter;
+
+       if (likely(!cpuctx->task_ctx))
+               return;
+
+       spin_lock(&ctx->lock);
+       list_for_each_entry(counter, &ctx->counters, list) {
+               if (!ctx->nr_active)
+                       break;
+               if (counter->active) {
+                       hw_perf_counter_disable(counter);
+                       counter->active = 0;
+                       counter->oncpu = -1;
+                       ctx->nr_active--;
+                       cpuctx->active_oncpu--;
+               }
+       }
+       spin_unlock(&ctx->lock);
+       cpuctx->task_ctx = NULL;
+}
+
+/*
+ * Called from scheduler to add the counters of the current task
+ * with interrupts disabled.
+ *
+ * We restore the counter value and then enable it.
+ *
+ * This does not protect us against NMI, but hw_perf_counter_enable()
+ * sets the enabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * keep the counter running.
+ *
+ * @task:      task being scheduled in
+ * @cpu:       CPU it is going to run on
+ *
+ * Returns immediately when @task has no counters. Otherwise counters
+ * are enabled in list order until max_pertask of them are active;
+ * counters bound to a specific CPU other than @cpu are skipped. On
+ * return the CPU's task_ctx points at the task's context.
+ */
+void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_counter_context *ctx = &task->perf_counter_ctx;
+       struct perf_counter *counter;
+
+       if (likely(!ctx->nr_counters))
+               return;
+
+       spin_lock(&ctx->lock);
+       list_for_each_entry(counter, &ctx->counters, list) {
+               if (ctx->nr_active == cpuctx->max_pertask)
+                       break;
+               if (counter->cpu != -1 && counter->cpu != cpu)
+                       continue;
+
+               hw_perf_counter_enable(counter);
+               counter->active = 1;
+               counter->oncpu = cpu;
+               ctx->nr_active++;
+               cpuctx->active_oncpu++;
+       }
+       spin_unlock(&ctx->lock);
+       cpuctx->task_ctx = ctx;
+}
+
+/*
+ * Rotate the counters of a task.
+ *
+ * The counters of @curr are scheduled out, the head of the context list
+ * is moved to the tail, and the counters are scheduled back in on @cpu.
+ * Because scheduling in enables counters in list order up to a limit,
+ * the rotation changes which counters get enabled next.
+ *
+ * Nothing is done for a task without counters.
+ */
+void perf_counter_task_tick(struct task_struct *curr, int cpu)
+{
+       struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+
+       if (likely(!ctx->nr_counters))
+               return;
+
+       perf_counter_task_sched_out(curr, cpu);
+
+       spin_lock(&ctx->lock);
+
+       /*
+        * Move the first entry to the end of the list, with the
+        * counters globally disabled around the list manipulation:
+        */
+       hw_perf_disable_all();
+       if (!list_empty(&ctx->counters))
+               list_move_tail(ctx->counters.next, &ctx->counters);
+       hw_perf_enable_all();
+
+       spin_unlock(&ctx->lock);
+
+       perf_counter_task_sched_in(curr, cpu);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ *
+ * Called from copy_process() for every new task. The new task_struct
+ * starts out as a copy of the parent's, so every member of the embedded
+ * context has to be reset here - including nr_active, which would
+ * otherwise carry over the parent's count of active counters although
+ * the child owns none.
+ *
+ * @task:      the task being created
+ */
+void perf_counter_init_task(struct task_struct *task)
+{
+       struct perf_counter_context *ctx = &task->perf_counter_ctx;
+
+       spin_lock_init(&ctx->lock);
+       INIT_LIST_HEAD(&ctx->counters);
+       ctx->nr_counters = 0;
+       ctx->nr_active = 0;
+       ctx->task = task;
+}
+
+/*
+ * smp call helper: forward the untyped @info argument, which is the
+ * counter to read, to the architecture's hw_perf_counter_read() hook.
+ */
+static void __hw_perf_counter_read(void *info)
+{
+       struct perf_counter *counter = info;
+
+       hw_perf_counter_read(counter);
+}
+
+/*
+ * Return the current value of a counter.
+ *
+ * An active counter is first read on the CPU it is running on
+ * (counter->oncpu), waiting for that cross call to finish. The value
+ * returned is always the cached one from the counter structure.
+ */
+static u64 perf_read_counter(struct perf_counter *counter)
+{
+       /* Not running anywhere: nothing to fetch from the hardware */
+       if (!counter->active)
+               return perf_read_counter_safe(counter);
+
+       smp_call_function_single(counter->oncpu, __hw_perf_counter_read,
+                                counter, 1);
+
+       return perf_read_counter_safe(counter);
+}
+
+/*
+ * Cross CPU call to switch performance data pointers
+ *
+ * @info is the struct perf_counter whose irqdata and usrdata pointers
+ * are exchanged. The new irqdata pointer is stored with an atomic
+ * store, the usrdata pointer with a plain assignment afterwards.
+ *
+ * For a task counter the swap is done under ctx->lock, and is skipped
+ * altogether when the task's context is not the one scheduled in on
+ * this CPU; the caller detects the skipped swap by irqdata being
+ * unchanged. For a per-CPU counter no lock is taken.
+ */
+static void __perf_switch_irq_data(void *info)
+{
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_counter *counter = info;
+       struct perf_counter_context *ctx = counter->ctx;
+       struct perf_data *oldirqdata = counter->irqdata;
+
+       /*
+        * If this is a task context, we need to check whether it is
+        * the current task context of this cpu. If not it has been
+        * scheduled out before the smp call arrived.
+        */
+       if (ctx->task) {
+               if (cpuctx->task_ctx != ctx)
+                       return;
+               spin_lock(&ctx->lock);
+       }
+
+       /* Change the pointer NMI safe */
+       atomic_long_set((atomic_long_t *)&counter->irqdata,
+                       (unsigned long) counter->usrdata);
+       counter->usrdata = oldirqdata;
+
+       if (ctx->task)
+               spin_unlock(&ctx->lock);
+}
+
+/*
+ * Exchange the irqdata and usrdata buffers of a counter and return the
+ * buffer that is the user buffer after the exchange (i.e. the previous
+ * irqdata buffer).
+ *
+ * Per-CPU counters are switched with an smp call on counter->cpu. A
+ * task counter that is not active is switched right here under
+ * ctx->lock. An active task counter is switched on the CPU the task
+ * runs on; if the task got scheduled out before the call arrived the
+ * pointers are unchanged and the whole sequence is repeated.
+ */
+static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
+{
+       struct perf_counter_context *ctx = counter->ctx;
+       struct perf_data *prev_irqdata = counter->irqdata;
+       struct task_struct *task = ctx->task;
+
+       if (!task) {
+               smp_call_function_single(counter->cpu,
+                                        __perf_switch_irq_data,
+                                        counter, 1);
+               return counter->usrdata;
+       }
+
+       for (;;) {
+               spin_lock_irq(&ctx->lock);
+               if (!counter->active) {
+                       /* Nobody feeds the buffers: swap them locally */
+                       counter->irqdata = counter->usrdata;
+                       counter->usrdata = prev_irqdata;
+                       spin_unlock_irq(&ctx->lock);
+                       return prev_irqdata;
+               }
+               spin_unlock_irq(&ctx->lock);
+
+               task_oncpu_function_call(task, __perf_switch_irq_data,
+                                        counter);
+
+               /* Unchanged pointer: task was scheduled out, try again */
+               if (counter->irqdata != prev_irqdata)
+                       return counter->usrdata;
+       }
+}
+
+/*
+ * Release a context obtained from find_get_context(): drop the task
+ * reference of a task context. A context without a task (per-CPU
+ * context) holds no reference, so there is nothing to do for it.
+ */
+static void put_context(struct perf_counter_context *ctx)
+{
+       struct task_struct *task = ctx->task;
+
+       if (!task)
+               return;
+
+       put_task_struct(task);
+}
+
+/*
+ * Look up the counter context selected by @pid and @cpu.
+ *
+ * @pid:       target task; 0 selects the calling task. Ignored when
+ *             @cpu names a CPU.
+ * @cpu:       -1 for a task counter, otherwise the CPU whose per-CPU
+ *             context is wanted
+ *
+ * Returns the context, or an ERR_PTR():
+ *   -EACCES   per-CPU counter requested without CAP_SYS_ADMIN, or the
+ *             ptrace access check on the target task failed
+ *   -EINVAL   @cpu is not a valid CPU number
+ *   -ENODEV   @cpu is not online
+ *   -ESRCH    no task with that pid
+ *
+ * For a task context a reference on the task is held on success; it is
+ * dropped with put_context().
+ */
+static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
+{
+       struct perf_cpu_context *cpuctx;
+       struct perf_counter_context *ctx;
+       struct task_struct *task;
+
+       /*
+        * If cpu is not a wildcard then this is a percpu counter:
+        */
+       if (cpu != -1) {
+               /* Must be root to operate on a CPU counter: */
+               if (!capable(CAP_SYS_ADMIN))
+                       return ERR_PTR(-EACCES);
+
+               /*
+                * Valid CPU numbers are 0 .. nr_cpu_ids - 1. Comparing
+                * against the *count* of possible CPUs is wrong twice:
+                * '>' lets one number too many through, and with a
+                * sparse possible map the count is not the highest id.
+                */
+               if (cpu < 0 || cpu >= nr_cpu_ids)
+                       return ERR_PTR(-EINVAL);
+
+               /*
+                * We could be clever and allow to attach a counter to an
+                * offline CPU and activate it when the CPU comes up, but
+                * that's for later.
+                */
+               if (!cpu_isset(cpu, cpu_online_map))
+                       return ERR_PTR(-ENODEV);
+
+               cpuctx = &per_cpu(perf_cpu_context, cpu);
+               ctx = &cpuctx->ctx;
+
+               /* A per-CPU context must never have a task attached */
+               WARN_ON_ONCE(ctx->task);
+               return ctx;
+       }
+
+       /* Pin the task while still under RCU protection */
+       rcu_read_lock();
+       if (!pid)
+               task = current;
+       else
+               task = find_task_by_vpid(pid);
+       if (task)
+               get_task_struct(task);
+       rcu_read_unlock();
+
+       if (!task)
+               return ERR_PTR(-ESRCH);
+
+       ctx = &task->perf_counter_ctx;
+       ctx->task = task;
+
+       /* Reuse ptrace permission checks for now. */
+       if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
+               /* Drops the task reference taken above */
+               put_context(ctx);
+               return ERR_PTR(-EACCES);
+       }
+
+       return ctx;
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ *
+ * Detaches the counter stored in file->private_data from its context,
+ * drops the context reference and frees the counter. The context
+ * pointer is sampled before the counter is torn down. Always returns 0.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+       struct perf_counter *counter = file->private_data;
+       struct perf_counter_context *ctx = counter->ctx;
+
+       file->private_data = NULL;
+
+       mutex_lock(&counter->mutex);
+
+       perf_remove_from_context(counter);
+       put_context(ctx);
+
+       mutex_unlock(&counter->mutex);
+
+       kfree(counter);
+
+       return 0;
+}
+
+/*
+ * Plain counter read, never blocks.
+ *
+ * The caller has to ask for exactly sizeof(u64) bytes, anything else is
+ * rejected with -EINVAL. The counter value is sampled under
+ * counter->mutex and then stored to @buf. Returns sizeof(u64) on
+ * success and -EFAULT if the store to user space fails.
+ */
+static ssize_t
+perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
+{
+       u64 __user *ubuf = (u64 __user *) buf;
+       u64 value;
+
+       if (count != sizeof(value))
+               return -EINVAL;
+
+       mutex_lock(&counter->mutex);
+       value = perf_read_counter(counter);
+       mutex_unlock(&counter->mutex);
+
+       if (put_user(value, ubuf))
+               return -EFAULT;
+
+       return sizeof(value);
+}
+
+/*
+ * Copy up to @count unread bytes from @usrdata to the user buffer @buf
+ * and consume them from the buffer.
+ *
+ * Returns the number of bytes copied (0 if the buffer is empty) or
+ * -EFAULT if the copy to user space fails, in which case the buffer is
+ * left untouched.
+ */
+static ssize_t
+perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
+{
+       size_t nbytes;
+
+       if (usrdata->len == 0)
+               return 0;
+
+       /* Never hand out more than what is buffered */
+       nbytes = (size_t)usrdata->len;
+       if (count < nbytes)
+               nbytes = count;
+
+       if (copy_to_user(buf, &usrdata->data[usrdata->rd_idx], nbytes))
+               return -EFAULT;
+
+       /* Consume the data; rewind the read index once drained */
+       usrdata->len -= nbytes;
+       if (usrdata->len)
+               usrdata->rd_idx += nbytes;
+       else
+               usrdata->rd_idx = 0;
+
+       return nbytes;
+}
+
+/*
+ * Read buffered IRQ data of a counter into a user buffer.
+ *
+ * @counter:     counter to read from
+ * @buf:         destination user buffer
+ * @count:       number of bytes requested
+ * @nonblocking: non-zero if the caller must not sleep
+ *
+ * Unless at least @count bytes are buffered (user buffer plus IRQ
+ * buffer), the caller either gets -EAGAIN (@nonblocking) or sleeps
+ * interruptibly on counter->waitq until that is the case. The data is
+ * then copied under counter->mutex: first whatever is left in the user
+ * buffer, then - after switching buffers - from the former IRQ buffer.
+ *
+ * Returns the number of bytes copied, -EAGAIN, -ERESTARTSYS when a
+ * signal ended the wait before enough data was available, or -EFAULT
+ * when nothing could be copied to user space.
+ */
+static ssize_t
+perf_read_irq_data(struct perf_counter *counter,
+                  char __user          *buf,
+                  size_t               count,
+                  int                  nonblocking)
+{
+       struct perf_data *irqdata, *usrdata;
+       DECLARE_WAITQUEUE(wait, current);
+       ssize_t res, copied;
+
+       irqdata = counter->irqdata;
+       usrdata = counter->usrdata;
+
+       if (usrdata->len + irqdata->len >= count)
+               goto read_pending;
+
+       if (nonblocking)
+               return -EAGAIN;
+
+       spin_lock_irq(&counter->waitq.lock);
+       __add_wait_queue(&counter->waitq, &wait);
+       for (;;) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (usrdata->len + irqdata->len >= count)
+                       break;
+
+               if (signal_pending(current))
+                       break;
+
+               spin_unlock_irq(&counter->waitq.lock);
+               schedule();
+               spin_lock_irq(&counter->waitq.lock);
+       }
+       __remove_wait_queue(&counter->waitq, &wait);
+       __set_current_state(TASK_RUNNING);
+       spin_unlock_irq(&counter->waitq.lock);
+
+       /* Left the loop because of a signal, not because of data */
+       if (usrdata->len + irqdata->len < count)
+               return -ERESTARTSYS;
+read_pending:
+       mutex_lock(&counter->mutex);
+
+       /* Drain pending data first: */
+       res = perf_copy_usrdata(usrdata, buf, count);
+       if (res < 0 || res == count)
+               goto out;
+
+       /* Switch irq buffer: */
+       usrdata = perf_switch_irq_data(counter);
+       copied = perf_copy_usrdata(usrdata, buf + res, count - res);
+       if (copied < 0) {
+               /* Keep the partial result if the first copy succeeded */
+               if (!res)
+                       res = -EFAULT;
+       } else {
+               /*
+                * The length checks above run without counter->mutex, so
+                * another reader may have consumed data in the meantime.
+                * Report what was really copied instead of assuming that
+                * the full request was satisfied.
+                */
+               res += copied;
+       }
+out:
+       mutex_unlock(&counter->mutex);
+
+       return res;
+}
+
+/*
+ * read() file operation: dispatch on the record type of the counter.
+ *
+ * PERF_RECORD_SIMPLE counters return the counter value, PERF_RECORD_IRQ
+ * and PERF_RECORD_GROUP counters return buffered IRQ data, honouring
+ * O_NONBLOCK. Any other record type yields -EINVAL.
+ */
+static ssize_t
+perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+       struct perf_counter *counter = file->private_data;
+       int nonblocking = file->f_flags & O_NONBLOCK;
+
+       if (counter->record_type == PERF_RECORD_SIMPLE)
+               return perf_read_hw(counter, buf, count);
+
+       if (counter->record_type == PERF_RECORD_IRQ ||
+           counter->record_type == PERF_RECORD_GROUP)
+               return perf_read_irq_data(counter, buf, count, nonblocking);
+
+       return -EINVAL;
+}
+
+/*
+ * poll() file operation: register on the counter's wait queue and
+ * report POLLIN if either data buffer holds unread bytes. The buffer
+ * lengths are sampled under the wait queue lock.
+ */
+static unsigned int perf_poll(struct file *file, poll_table *wait)
+{
+       struct perf_counter *counter = file->private_data;
+       unsigned long flags;
+       unsigned int mask;
+
+       poll_wait(file, &counter->waitq, wait);
+
+       spin_lock_irqsave(&counter->waitq.lock, flags);
+       mask = (counter->usrdata->len || counter->irqdata->len) ? POLLIN : 0;
+       spin_unlock_irqrestore(&counter->waitq.lock, flags);
+
+       return mask;
+}
+
+static const struct file_operations perf_fops = { /* fd ops for counters, passed to anon_inode_getfd() */
+       .release                = perf_release, /* detach and free the counter */
+       .read                   = perf_read,    /* counter value or IRQ data */
+       .poll                   = perf_poll,    /* POLLIN when data is buffered */
+};
+
+/*
+ * Allocate a zeroed counter and bring it into its initial state.
+ *
+ * The mutex, list linkage and wait queue are initialised, data[0] becomes
+ * the irq-side buffer and data[1] the user-side buffer, and the caller's
+ * @cpu, @record_type and @hw_event_period are recorded in the counter.
+ *
+ * Returns the new counter, or NULL if the allocation fails.
+ */
+static struct perf_counter *
+perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type)
+{
+       struct perf_counter *counter;
+
+       counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+       if (counter == NULL)
+               return NULL;
+
+       /* Caller-supplied configuration: */
+       counter->cpu            = cpu;
+       counter->record_type    = record_type;
+       counter->__irq_period   = hw_event_period;
+       counter->wakeup_pending = 0;
+
+       /* Initial roles of the two data buffers: */
+       counter->irqdata        = &counter->data[0];
+       counter->usrdata        = &counter->data[1];
+
+       /* Synchronisation objects and list linkage: */
+       init_waitqueue_head(&counter->waitq);
+       INIT_LIST_HEAD(&counter->list);
+       mutex_init(&counter->mutex);
+
+       return counter;
+}
+
+/**
+ * sys_perf_counter_open - open a performance counter and attach it to a context
+ * @hw_event_type:     event type for monitoring/sampling...
+ * @hw_event_period:   period value recorded in the new counter
+ * @record_type:       record type recorded in the new counter
+ * @pid:               target pid
+ * @cpu:               target cpu
+ *
+ * Looks up the context for @pid/@cpu, allocates and initialises a counter,
+ * installs it in that context and wraps it in an anonymous-inode file.
+ *
+ * Returns the new file descriptor on success. On failure the error of the
+ * failing step is returned (-ENOMEM if the counter cannot be allocated) and
+ * everything set up so far is undone.
+ */
+asmlinkage int
+sys_perf_counter_open(u32 hw_event_type,
+                     u32 hw_event_period,
+                     u32 record_type,
+                     pid_t pid,
+                     int cpu)
+{
+       struct perf_counter_context *ctx;
+       struct perf_counter *counter;
+       int ret;
+
+       /* Nothing to unwind yet, so a lookup failure returns directly: */
+       ctx = find_get_context(pid, cpu);
+       if (IS_ERR(ctx))
+               return PTR_ERR(ctx);
+
+       ret = -ENOMEM;
+       counter = perf_counter_alloc(hw_event_period, cpu, record_type);
+       if (!counter)
+               goto err_put_context;
+
+       ret = hw_perf_counter_init(counter, hw_event_type);
+       if (ret)
+               goto err_free_put_context;
+
+       /* The counter is part of the context before the fd exists: */
+       perf_install_in_context(ctx, counter, cpu);
+
+       ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+       if (ret < 0)
+               goto err_remove_free_put_context;
+
+       return ret;
+
+       /*
+        * Error unwinding: each label undoes one step and falls through
+        * to the labels below it, in reverse order of setup.
+        */
+err_remove_free_put_context:
+       mutex_lock(&counter->mutex);
+       perf_remove_from_context(counter);
+       mutex_unlock(&counter->mutex);
+
+err_free_put_context:
+       kfree(counter);
+
+err_put_context:
+       put_context(ctx);
+
+       return ret;
+}
+
+/*
+ * Prepare the per-cpu counter context of @cpu: initialise its lock and
+ * counter list, derive the per-task limit from the global settings and
+ * call the hardware setup hook.
+ */
+static void __cpuinit perf_init_cpu(int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+
+       INIT_LIST_HEAD(&cpuctx->ctx.counters);
+       spin_lock_init(&cpuctx->ctx.lock);
+
+       /* perf_reserved_percpu is updated under perf_resource_mutex: */
+       mutex_lock(&perf_resource_mutex);
+       cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
+       mutex_unlock(&perf_resource_mutex);
+
+       hw_perf_counter_setup();
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Cross-call target of perf_exit_cpu(): removes every counter found on the
+ * executing cpu's context list. @info is unused.
+ */
+static void __perf_exit_cpu(void *info)
+{
+       struct perf_cpu_context *cpuctx;
+       struct perf_counter_context *ctx;
+       struct perf_counter *counter, *next;
+
+       cpuctx = &__get_cpu_var(perf_cpu_context);
+       ctx = &cpuctx->ctx;
+
+       /* The _safe iterator is needed because entries go away as we walk: */
+       list_for_each_entry_safe(counter, next, &ctx->counters, list)
+               __perf_remove_from_context(counter);
+}
+
+/*
+ * Run __perf_exit_cpu() on @cpu and wait for it to complete.
+ */
+static void perf_exit_cpu(int cpu)
+{
+       smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1);
+}
+#else
+/* Without CPU hotplug there is nothing to tear down. */
+static inline void perf_exit_cpu(int cpu) { }
+#endif
+
+/*
+ * CPU notifier callback.
+ *
+ * Initialises the per-cpu counter context when a cpu is being prepared to
+ * come up and tears it down when a cpu is being prepared to go down (both
+ * the normal and the _FROZEN variants of each action). All other actions
+ * are ignored. Always returns NOTIFY_OK.
+ */
+static int __cpuinit
+perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       if (action == CPU_UP_PREPARE || action == CPU_UP_PREPARE_FROZEN)
+               perf_init_cpu(cpu);
+       else if (action == CPU_DOWN_PREPARE ||
+                action == CPU_DOWN_PREPARE_FROZEN)
+               perf_exit_cpu(cpu);
+
+       return NOTIFY_OK;
+}
+
+/* Notifier block that routes CPU notifier events to perf_cpu_notify(). */
+static struct notifier_block __cpuinitdata perf_cpu_nb = {
+       .notifier_call          = perf_cpu_notify,
+};
+
+/*
+ * Early init: run the CPU_UP_PREPARE handling by hand for the cpu this code
+ * executes on, then register the notifier so later cpu events are handled
+ * through perf_cpu_notify(). Always returns 0.
+ */
+static int __init perf_counter_init(void)
+{
+       void *this_cpu = (void *)(long)smp_processor_id();
+
+       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, this_cpu);
+       register_cpu_notifier(&perf_cpu_nb);
+
+       return 0;
+}
+early_initcall(perf_counter_init);
+
+/*
+ * sysfs show handler: print the current perf_reserved_percpu value as a
+ * decimal number followed by a newline. Returns the number of bytes written.
+ */
+static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
+{
+       int len = sprintf(buf, "%d\n", perf_reserved_percpu);
+
+       return len;
+}
+
+/*
+ * sysfs store handler for the per-cpu reservation.
+ *
+ * Parses @buf as a decimal number and rejects values above
+ * perf_max_counters with -EINVAL. On success the new reservation is stored
+ * and max_pertask is recomputed for every online cpu as the smaller of
+ * "counters not currently in that cpu's context" and "counters not
+ * reserved". Returns @count on success or the parse error.
+ */
+static ssize_t
+perf_set_reserve_percpu(struct sysdev_class *class,
+                       const char *buf,
+                       size_t count)
+{
+       unsigned long val;
+       int cpu, err;
+
+       err = strict_strtoul(buf, 10, &val);
+       if (err)
+               return err;
+       if (val > perf_max_counters)
+               return -EINVAL;
+
+       mutex_lock(&perf_resource_mutex);
+       perf_reserved_percpu = val;
+       for_each_online_cpu(cpu) {
+               struct perf_cpu_context *cpuctx;
+               int limit;
+
+               cpuctx = &per_cpu(perf_cpu_context, cpu);
+
+               /* nr_counters is read under the context lock: */
+               spin_lock_irq(&cpuctx->ctx.lock);
+               limit = min(perf_max_counters - cpuctx->ctx.nr_counters,
+                           perf_max_counters - perf_reserved_percpu);
+               cpuctx->max_pertask = limit;
+               spin_unlock_irq(&cpuctx->ctx.lock);
+       }
+       mutex_unlock(&perf_resource_mutex);
+
+       return count;
+}
+
+/*
+ * sysfs show handler: print the current perf_overcommit value as a decimal
+ * number followed by a newline. Returns the number of bytes written.
+ */
+static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
+{
+       int len = sprintf(buf, "%d\n", perf_overcommit);
+
+       return len;
+}
+
+/*
+ * sysfs store handler for the overcommit switch.
+ *
+ * Accepts only the decimal values 0 and 1; anything larger is rejected with
+ * -EINVAL and a parse failure returns the parser's error. The new value is
+ * written under perf_resource_mutex. Returns @count on success.
+ */
+static ssize_t
+perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
+{
+       unsigned long val;
+       int err = strict_strtoul(buf, 10, &val);
+
+       if (err)
+               return err;
+       if (val > 1)
+               return -EINVAL;
+
+       mutex_lock(&perf_resource_mutex);
+       perf_overcommit = val;
+       mutex_unlock(&perf_resource_mutex);
+
+       return count;
+}
+
+/*
+ * sysdev class attributes, mode 0644, each wired to its show/store pair.
+ * They are referenced below as attr_reserve_percpu and attr_overcommit.
+ */
+static SYSDEV_CLASS_ATTR(reserve_percpu, 0644,
+                        perf_show_reserve_percpu, perf_set_reserve_percpu);
+
+static SYSDEV_CLASS_ATTR(overcommit, 0644,
+                        perf_show_overcommit, perf_set_overcommit);
+
+/* NULL-terminated list of the attributes placed in the perf_counters group. */
+static struct attribute *perfclass_attrs[] = {
+       &attr_reserve_percpu.attr,
+       &attr_overcommit.attr,
+       NULL
+};
+
+/* Attribute group "perf_counters" holding the attributes listed above. */
+static struct attribute_group perfclass_attr_group = {
+       .name                   = "perf_counters",
+       .attrs                  = perfclass_attrs,
+};
+
+/*
+ * Create the "perf_counters" attribute group on the cpu sysdev class
+ * kobject. Returns whatever sysfs_create_group() returns.
+ */
+static int __init perf_counter_sysfs_init(void)
+{
+       struct kobject *cpu_kobj = &cpu_sysdev_class.kset.kobj;
+
+       return sysfs_create_group(cpu_kobj, &perfclass_attr_group);
+}
+device_initcall(perf_counter_sysfs_init);
+
index b7480fb..254d56d 100644 (file)
@@ -2212,6 +2212,27 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:         the task to evaluate
+ * @func:      the function to be called
+ * @info:      the function call argument
+ *
+ * If @p is currently running, calls @func(@info) on @p's cpu (directly when
+ * that is the current CPU) and waits for it; otherwise does nothing.
+ */
+void task_oncpu_function_call(struct task_struct *p,
+                             void (*func) (void *info), void *info)
+{
+       int cpu;
+
+       preempt_disable();
+       cpu = task_cpu(p);
+       if (task_curr(p))
+               smp_call_function_single(cpu, func, info, 1);
+       preempt_enable();
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2534,6 +2555,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
 {
        fire_sched_out_preempt_notifiers(prev, next);
+       perf_counter_task_sched_out(prev, cpu_of(rq));
        prepare_lock_switch(rq, next);
        prepare_arch_switch(next);
 }
@@ -2574,6 +2596,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
+       perf_counter_task_sched_in(current, cpu_of(rq));
        finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
        if (current->sched_class->post_schedule)
@@ -4296,6 +4319,7 @@ void scheduler_tick(void)
        rq->idle_at_tick = idle_cpu(cpu);
        trigger_load_balance(rq, cpu);
 #endif
+       perf_counter_task_tick(curr, cpu);
 }
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
index e14a232..4be8bbc 100644 (file)
@@ -174,3 +174,6 @@ cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
+
+/* performance counters: */
+cond_syscall(sys_perf_counter_open);