* See also PERF_RECORD_MISC_EXACT_IP
*/
precise_ip : 2, /* skid constraint */
+ mmap_data : 1, /* non-exec mmap data */
+ sample_id_all : 1, /* sample_type all events */
- __reserved_1 : 47;
+ __reserved_1 : 45;
union {
__u32 wakeup_events; /* wakeup every n events */
enum perf_event_type {
/*
+ * If perf_event_attr.sample_id_all is set then all event types will
+ * have the sample_type selected fields related to where/when
+ * (identity) an event took place (TID, TIME, ID, CPU, STREAM_ID)
+ * described in PERF_RECORD_SAMPLE below, it will be stashed just after
+ * the perf_event_header and the fields already present for the existing
+ * fields, i.e. at the end of the payload. That way a newer perf.data
+ * file will be supported by older perf tools, with these new optional
+ * fields being ignored.
+ *
* The MMAP events record the PROT_EXEC mappings so that we can
* correlate userspace IPs to code. They have the following structure:
*
#define PERF_FLAG_FD_NO_GROUP (1U << 0)
#define PERF_FLAG_FD_OUTPUT (1U << 1)
+#define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=cgroup id, per-cpu mode only */
#ifdef __KERNEL__
/*
*/
#ifdef CONFIG_PERF_EVENTS
+# include <linux/cgroup.h>
# include <asm/perf_event.h>
+# include <asm/local64.h>
#endif
struct perf_guest_info_callbacks {
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
+#include <linux/irq_work.h>
+#include <linux/jump_label_ref.h>
#include <asm/atomic.h>
+#include <asm/local.h>
#define PERF_MAX_STACK_DEPTH 255
int last_cpu;
};
struct { /* software */
- s64 remaining;
struct hrtimer hrtimer;
};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
- /* breakpoint */
- struct arch_hw_breakpoint info;
+ struct { /* breakpoint */
+ struct arch_hw_breakpoint info;
+ struct list_head bp_list;
+ /*
+ * Crufty hack to avoid the chicken and egg
+ * problem hw_breakpoint has with context
+ * creation and event initalization.
+ */
+ struct task_struct *bp_target;
+ };
#endif
};
- atomic64_t prev_count;
+ int state;
+ local64_t prev_count;
u64 sample_period;
u64 last_period;
- atomic64_t period_left;
+ local64_t period_left;
u64 interrupts;
u64 freq_time_stamp;
#endif
};
+/*
+ * hw_perf_event::state flags
+ */
+#define PERF_HES_STOPPED 0x01 /* the counter is stopped */
+#define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */
+#define PERF_HES_ARCH 0x04
+
struct perf_event;
+/*
+ * Common implementation detail of pmu::{start,commit,cancel}_txn
+ */
+#define PERF_EVENT_TXN 0x1
+
/**
* struct pmu - generic performance monitoring unit
*/
struct pmu {
- int (*enable) (struct perf_event *event);
- void (*disable) (struct perf_event *event);
- int (*start) (struct perf_event *event);
- void (*stop) (struct perf_event *event);
+ struct list_head entry;
+
+ struct device *dev;
+ char *name;
+ int type;
+
+ int * __percpu pmu_disable_count;
+ struct perf_cpu_context * __percpu pmu_cpu_context;
+ int task_ctx_nr;
+
+ /*
+ * Fully disable/enable this PMU, can be used to protect from the PMI
+ * as well as for lazy/batch writing of the MSRs.
+ */
+ void (*pmu_enable) (struct pmu *pmu); /* optional */
+ void (*pmu_disable) (struct pmu *pmu); /* optional */
+
+ /*
+ * Try and initialize the event for this PMU.
+ * Should return -ENOENT when the @event doesn't match this PMU.
+ */
+ int (*event_init) (struct perf_event *event);
+
+#define PERF_EF_START 0x01 /* start the counter when adding */
+#define PERF_EF_RELOAD 0x02 /* reload the counter when starting */
+#define PERF_EF_UPDATE 0x04 /* update the counter when stopping */
+
+ /*
+ * Adds/Removes a counter to/from the PMU, can be done inside
+ * a transaction, see the ->*_txn() methods.
+ */
+ int (*add) (struct perf_event *event, int flags);
+ void (*del) (struct perf_event *event, int flags);
+
+ /*
+ * Starts/Stops a counter present on the PMU. The PMI handler
+ * should stop the counter when perf_event_overflow() returns
+ * !0. ->start() will be used to continue.
+ */
+ void (*start) (struct perf_event *event, int flags);
+ void (*stop) (struct perf_event *event, int flags);
+
+ /*
+ * Updates the counter value of the event.
+ */
void (*read) (struct perf_event *event);
- void (*unthrottle) (struct perf_event *event);
+
+ /*
+ * Group events scheduling is treated as a transaction, add
+ * group events as a whole and perform one schedulability test.
+ * If the test fails, roll back the whole group
+ *
+ * Start the transaction, after this ->add() doesn't need to
+ * do schedulability tests.
+ */
+ void (*start_txn) (struct pmu *pmu); /* optional */
+ /*
+ * If ->start_txn() disabled the ->add() schedulability test
+ * then ->commit_txn() is required to perform one. On success
+ * the transaction is closed. On error the transaction is kept
+ * open until ->cancel_txn() is called.
+ */
+ int (*commit_txn) (struct pmu *pmu); /* optional */
+ /*
+ * Will cancel the transaction, assumes ->del() is called
+ * for each successfull ->add() during the transaction.
+ */
+ void (*cancel_txn) (struct pmu *pmu); /* optional */
};
/**
* enum perf_event_active_state - the states of a event
*/
enum perf_event_active_state {
- PERF_EVENT_STATE_FREE = -3,
PERF_EVENT_STATE_ERROR = -2,
PERF_EVENT_STATE_OFF = -1,
PERF_EVENT_STATE_INACTIVE = 0,
struct file;
-struct perf_mmap_data {
+#define PERF_BUFFER_WRITABLE 0x01
+
+struct perf_buffer {
+ atomic_t refcount;
struct rcu_head rcu_head;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
+ int page_order; /* allocation order */
#endif
- int data_order;
int nr_pages; /* nr of data pages */
int writable; /* are we writable */
- int nr_locked; /* nr pages mlocked */
atomic_t poll; /* POLL_ for wakeups */
- atomic_t events; /* event_id limit */
-
- atomic_long_t head; /* write position */
- atomic_long_t done_head; /* completed head */
- atomic_t lock; /* concurrent writes */
- atomic_t wakeup; /* needs a wakeup */
- atomic_t lost; /* nr records lost */
+ local_t head; /* write position */
+ local_t nest; /* nested writers */
+ local_t events; /* event limit */
+ local_t wakeup; /* wakeup stamp */
+ local_t lost; /* nr records lost */
long watermark; /* wakeup watermark */
void *data_pages[0];
};
-struct perf_pending_entry {
- struct perf_pending_entry *next;
- void (*func)(struct perf_pending_entry *);
-};
-
struct perf_sample_data;
typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
struct rcu_head rcu_head;
};
+#define PERF_ATTACH_CONTEXT 0x01
+#define PERF_ATTACH_GROUP 0x02
+#define PERF_ATTACH_TASK 0x04
+
+#ifdef CONFIG_CGROUP_PERF
+/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+ u64 time;
+ u64 timestamp;
+};
+
+struct perf_cgroup {
+ struct cgroup_subsys_state css;
+ struct perf_cgroup_info *info; /* timing info, one per cpu */
+};
+#endif
+
/**
* struct perf_event - performance event kernel representation:
*/
int nr_siblings;
int group_flags;
struct perf_event *group_leader;
- struct perf_event *output;
- const struct pmu *pmu;
+ struct pmu *pmu;
enum perf_event_active_state state;
- atomic64_t count;
+ unsigned int attach_state;
+ local64_t count;
+ atomic64_t child_count;
/*
* These are the total time in nanoseconds that the event
u64 tstamp_running;
u64 tstamp_stopped;
+ /*
+ * timestamp shadows the actual context timing but it can
+ * be safely used in NMI interrupt context. It reflects the
+ * context time as it was when the event was last scheduled in.
+ *
+ * ctx_time already accounts for ctx->timestamp. Therefore to
+ * compute ctx_time for a sample, simply add perf_clock().
+ */
+ u64 shadow_ctx_time;
+
struct perf_event_attr attr;
+ u16 header_size;
+ u16 id_header_size;
+ u16 read_size;
struct hw_perf_event hw;
struct perf_event_context *ctx;
/* mmap bits */
struct mutex mmap_mutex;
atomic_t mmap_count;
- struct perf_mmap_data *data;
+ int mmap_locked;
+ struct user_struct *mmap_user;
+ struct perf_buffer *buffer;
/* poll related */
wait_queue_head_t waitq;
int pending_wakeup;
int pending_kill;
int pending_disable;
- struct perf_pending_entry pending;
+ struct irq_work pending;
atomic_t event_limit;
perf_overflow_handler_t overflow_handler;
#ifdef CONFIG_EVENT_TRACING
+ struct ftrace_event_call *tp_event;
struct event_filter *filter;
#endif
+#ifdef CONFIG_CGROUP_PERF
+ struct perf_cgroup *cgrp; /* cgroup event is attach to */
+ int cgrp_defer_enabled;
+#endif
+
#endif /* CONFIG_PERF_EVENTS */
};
+enum perf_event_context_type {
+ task_context,
+ cpu_context,
+};
+
/**
* struct perf_event_context - event context structure
*
* Used as a container for task events and CPU events as well:
*/
struct perf_event_context {
+ enum perf_event_context_type type;
+ struct pmu *pmu;
/*
* Protect the states of the events in the list,
* nr_active, and the list:
int nr_active;
int is_active;
int nr_stat;
+ int rotate_disable;
atomic_t refcount;
struct task_struct *task;
u64 generation;
int pin_count;
struct rcu_head rcu_head;
+ int nr_cgroups; /* cgroup events present */
};
+/*
+ * Number of contexts where an event can trigger:
+ * task, softirq, hardirq, nmi.
+ */
+#define PERF_NR_CONTEXTS 4
+
/**
* struct perf_event_cpu_context - per cpu event context structure
*/
struct perf_event_context ctx;
struct perf_event_context *task_ctx;
int active_oncpu;
- int max_pertask;
int exclusive;
- struct swevent_hlist *swevent_hlist;
- struct mutex hlist_mutex;
- int hlist_refcount;
-
- /*
- * Recursion avoidance:
- *
- * task, softirq, irq, nmi context
- */
- int recursion[4];
+ struct list_head rotation_list;
+ int jiffies_interval;
+ struct pmu *active_pmu;
+#ifdef CONFIG_CGROUP_PERF
+ struct perf_cgroup *cgrp;
+#endif
};
struct perf_output_handle {
struct perf_event *event;
- struct perf_mmap_data *data;
- unsigned long head;
- unsigned long offset;
+ struct perf_buffer *buffer;
+ unsigned long wakeup;
+ unsigned long size;
+ void *addr;
+ int page;
int nmi;
int sample;
- int locked;
};
#ifdef CONFIG_PERF_EVENTS
-/*
- * Set by architecture code:
- */
-extern int perf_max_events;
-
-extern const struct pmu *hw_perf_event_init(struct perf_event *event);
+extern int perf_pmu_register(struct pmu *pmu, char *name, int type);
+extern void perf_pmu_unregister(struct pmu *pmu);
-extern void perf_event_task_sched_in(struct task_struct *task);
-extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
-extern void perf_event_task_tick(struct task_struct *task);
+extern int perf_num_counters(void);
+extern const char *perf_pmu_name(void);
+extern void __perf_event_task_sched_in(struct task_struct *task);
+extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
-extern void set_perf_event_pending(void);
-extern void perf_event_do_pending(void);
+extern void perf_event_delayed_put(struct task_struct *task);
extern void perf_event_print_debug(void);
-extern void __perf_disable(void);
-extern bool __perf_enable(void);
-extern void perf_disable(void);
-extern void perf_enable(void);
+extern void perf_pmu_disable(struct pmu *pmu);
+extern void perf_pmu_enable(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);
-extern int hw_perf_group_sched_in(struct perf_event *group_leader,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx);
extern void perf_event_update_userpage(struct perf_event *event);
extern int perf_event_release_kernel(struct perf_event *event);
extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
int cpu,
- pid_t pid,
+ struct task_struct *task,
perf_overflow_handler_t callback);
extern u64 perf_event_read_value(struct perf_event *event,
u64 *enabled, u64 *running);
struct perf_sample_data *data,
struct pt_regs *regs);
+static inline bool is_sampling_event(struct perf_event *event)
+{
+ return event->attr.sample_period != 0;
+}
+
/*
* Return 1 for a software event, 0 for a hardware event
*/
static inline int is_software_event(struct perf_event *event)
{
- switch (event->attr.type) {
- case PERF_TYPE_SOFTWARE:
- case PERF_TYPE_TRACEPOINT:
- /* for now the breakpoint stuff also works as software event */
- case PERF_TYPE_BREAKPOINT:
- return 1;
- }
- return 0;
+ return event->pmu->task_ctx_nr == perf_sw_context;
}
extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);
-extern void
-perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip);
+#ifndef perf_arch_fetch_caller_regs
+static inline void
+perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
+#endif
/*
* Take a snapshot of the regs. Skip ip and frame pointer to
* - bp for callchains
* - eflags, for future purposes, just in case
*/
-static inline void perf_fetch_caller_regs(struct pt_regs *regs, int skip)
+static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
- unsigned long ip;
-
memset(regs, 0, sizeof(*regs));
- switch (skip) {
- case 1 :
- ip = CALLER_ADDR0;
- break;
- case 2 :
- ip = CALLER_ADDR1;
- break;
- case 3 :
- ip = CALLER_ADDR2;
- break;
- case 4:
- ip = CALLER_ADDR3;
- break;
- /* No need to support further for now */
- default:
- ip = 0;
- }
-
- return perf_arch_fetch_caller_regs(regs, ip, skip);
+ perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
}
-static inline void
+static __always_inline void
perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
{
- if (atomic_read(&perf_swevent_enabled[event_id])) {
- struct pt_regs hot_regs;
-
- if (!regs) {
- perf_fetch_caller_regs(&hot_regs, 1);
- regs = &hot_regs;
- }
- __perf_sw_event(event_id, nr, nmi, regs, addr);
+ struct pt_regs hot_regs;
+
+ JUMP_LABEL(&perf_swevent_enabled[event_id], have_event);
+ return;
+
+have_event:
+ if (!regs) {
+ perf_fetch_caller_regs(&hot_regs);
+ regs = &hot_regs;
}
+ __perf_sw_event(event_id, nr, nmi, regs, addr);
}
-extern void __perf_event_mmap(struct vm_area_struct *vma);
+extern atomic_t perf_sched_events;
+
+static inline void perf_event_task_sched_in(struct task_struct *task)
+{
+ COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task));
+}
-static inline void perf_event_mmap(struct vm_area_struct *vma)
+static inline
+void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
{
- if (vma->vm_flags & VM_EXEC)
- __perf_event_mmap(vma);
+ perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+
+ COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next));
}
+extern void perf_event_mmap(struct vm_area_struct *vma);
extern struct perf_guest_info_callbacks *perf_guest_cbs;
extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
extern void perf_event_comm(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+/* Callchains */
+DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
+
+extern void perf_callchain_user(struct perf_callchain_entry *entry,
+ struct pt_regs *regs);
+extern void perf_callchain_kernel(struct perf_callchain_entry *entry,
+ struct pt_regs *regs);
+
+
+static inline void
+perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
+{
+ if (entry->nr < PERF_MAX_STACK_DEPTH)
+ entry->ip[entry->nr++] = ip;
+}
extern int sysctl_perf_event_paranoid;
extern int sysctl_perf_event_mlock;
}
extern void perf_event_init(void);
-extern void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
- int entry_size, struct pt_regs *regs);
+extern void perf_tp_event(u64 addr, u64 count, void *record,
+ int entry_size, struct pt_regs *regs,
+ struct hlist_head *head, int rctx);
extern void perf_bp_event(struct perf_event *event, void *data);
#ifndef perf_misc_flags
extern void perf_swevent_put_recursion_context(int rctx);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
+extern void perf_event_task_tick(void);
#else
static inline void
perf_event_task_sched_in(struct task_struct *task) { }
static inline void
perf_event_task_sched_out(struct task_struct *task,
struct task_struct *next) { }
-static inline void
-perf_event_task_tick(struct task_struct *task) { }
static inline int perf_event_init_task(struct task_struct *child) { return 0; }
static inline void perf_event_exit_task(struct task_struct *child) { }
static inline void perf_event_free_task(struct task_struct *task) { }
-static inline void perf_event_do_pending(void) { }
+static inline void perf_event_delayed_put(struct task_struct *task) { }
static inline void perf_event_print_debug(void) { }
-static inline void perf_disable(void) { }
-static inline void perf_enable(void) { }
static inline int perf_event_task_disable(void) { return -EINVAL; }
static inline int perf_event_task_enable(void) { return -EINVAL; }
static inline void perf_swevent_put_recursion_context(int rctx) { }
static inline void perf_event_enable(struct perf_event *event) { }
static inline void perf_event_disable(struct perf_event *event) { }
+static inline void perf_event_task_tick(void) { }
#endif
#define perf_output_put(handle, x) \
#define perf_cpu_notifier(fn) \
do { \
static struct notifier_block fn##_nb __cpuinitdata = \
- { .notifier_call = fn, .priority = 20 }; \
+ { .notifier_call = fn, .priority = CPU_PRI_PERF }; \
fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE, \
(void *)(unsigned long)smp_processor_id()); \
fn(&fn##_nb, (unsigned long)CPU_STARTING, \