Merge branch 'bugfixes' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6
[linux-2.6.git] / include / linux / perf_event.h
1 /*
2  * Performance events:
3  *
4  *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
5  *    Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
6  *    Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
7  *
8  * Data type definitions, declarations, prototypes.
9  *
10  *    Started by: Thomas Gleixner and Ingo Molnar
11  *
12  * For licencing details see kernel-base/COPYING
13  */
14 #ifndef _LINUX_PERF_EVENT_H
15 #define _LINUX_PERF_EVENT_H
16
17 #include <linux/types.h>
18 #include <linux/ioctl.h>
19 #include <asm/byteorder.h>
20
21 /*
22  * User-space ABI bits:
23  */
24
25 /*
26  * attr.type: major event class; it selects how attr.config is interpreted.
27  */
28 enum perf_type_id {
29         PERF_TYPE_HARDWARE                      = 0,
30         PERF_TYPE_SOFTWARE                      = 1,
31         PERF_TYPE_TRACEPOINT                    = 2,
32         PERF_TYPE_HW_CACHE                      = 3,
33         PERF_TYPE_RAW                           = 4,
34         PERF_TYPE_BREAKPOINT                    = 5,
35
36         PERF_TYPE_MAX,                          /* non-ABI: one past the last type */
37 };
38
39 /*
40  * Generalized hardware event ids for the sys_perf_event_open() syscall.
41  * NOTE(review): struct perf_event_attr has no event_id field; these are
42  * presumably passed in attr.config with PERF_TYPE_HARDWARE -- confirm.
43  */
44 enum perf_hw_id {
45         /*
46          * Common hardware events, generalized by the kernel:
47          */
48         PERF_COUNT_HW_CPU_CYCLES                = 0,
49         PERF_COUNT_HW_INSTRUCTIONS              = 1,
50         PERF_COUNT_HW_CACHE_REFERENCES          = 2,
51         PERF_COUNT_HW_CACHE_MISSES              = 3,
52         PERF_COUNT_HW_BRANCH_INSTRUCTIONS       = 4,
53         PERF_COUNT_HW_BRANCH_MISSES             = 5,
54         PERF_COUNT_HW_BUS_CYCLES                = 6,
55
56         PERF_COUNT_HW_MAX,                      /* non-ABI: one past the last id */
57 };
58
59 /*
60  * Generalized hardware cache events, the cross product of three enums:
61  *
62  *       { L1-D, L1-I, LLC, DTLB, ITLB, BPU } x
63  *       { read, write, prefetch } x
64  *       { accesses, misses }
65  */
66 enum perf_hw_cache_id {
67         PERF_COUNT_HW_CACHE_L1D                 = 0,
68         PERF_COUNT_HW_CACHE_L1I                 = 1,
69         PERF_COUNT_HW_CACHE_LL                  = 2,
70         PERF_COUNT_HW_CACHE_DTLB                = 3,
71         PERF_COUNT_HW_CACHE_ITLB                = 4,
72         PERF_COUNT_HW_CACHE_BPU                 = 5,
73
74         PERF_COUNT_HW_CACHE_MAX,                /* non-ABI: one past the last id */
75 };
76 /* Operation axis ({ read, write, prefetch }) of the generalized hardware cache events. */
77 enum perf_hw_cache_op_id {
78         PERF_COUNT_HW_CACHE_OP_READ             = 0,
79         PERF_COUNT_HW_CACHE_OP_WRITE            = 1,
80         PERF_COUNT_HW_CACHE_OP_PREFETCH         = 2,
81
82         PERF_COUNT_HW_CACHE_OP_MAX,             /* non-ABI: one past the last op */
83 };
84 /* Result axis ({ accesses, misses }) of the generalized hardware cache events. */
85 enum perf_hw_cache_op_result_id {
86         PERF_COUNT_HW_CACHE_RESULT_ACCESS       = 0,
87         PERF_COUNT_HW_CACHE_RESULT_MISS         = 1,
88
89         PERF_COUNT_HW_CACHE_RESULT_MAX,         /* non-ABI: one past the last result */
90 };
91
92 /*
93  * Special "software" events provided by the kernel, even if the hardware
94  * does not support performance events. These events measure various
95  * physical and software events of the kernel (and allow profiling them as
96  * well):
97  */
98 enum perf_sw_ids {
99         PERF_COUNT_SW_CPU_CLOCK                 = 0,
100         PERF_COUNT_SW_TASK_CLOCK                = 1,
101         PERF_COUNT_SW_PAGE_FAULTS               = 2,
102         PERF_COUNT_SW_CONTEXT_SWITCHES          = 3,
103         PERF_COUNT_SW_CPU_MIGRATIONS            = 4,
104         PERF_COUNT_SW_PAGE_FAULTS_MIN           = 5,
105         PERF_COUNT_SW_PAGE_FAULTS_MAJ           = 6,
106         PERF_COUNT_SW_ALIGNMENT_FAULTS          = 7,
107         PERF_COUNT_SW_EMULATION_FAULTS          = 8,
108
109         PERF_COUNT_SW_MAX,                      /* non-ABI: one past the last id */
110 };
111
112 /*
113  * Bits that can be set in attr.sample_type to request information
114  * in the overflow packets; see PERF_RECORD_SAMPLE for the resulting layout.
115  */
116 enum perf_event_sample_format {
117         PERF_SAMPLE_IP                          = 1U << 0,
118         PERF_SAMPLE_TID                         = 1U << 1,
119         PERF_SAMPLE_TIME                        = 1U << 2,
120         PERF_SAMPLE_ADDR                        = 1U << 3,
121         PERF_SAMPLE_READ                        = 1U << 4,
122         PERF_SAMPLE_CALLCHAIN                   = 1U << 5,
123         PERF_SAMPLE_ID                          = 1U << 6,
124         PERF_SAMPLE_CPU                         = 1U << 7,
125         PERF_SAMPLE_PERIOD                      = 1U << 8,
126         PERF_SAMPLE_STREAM_ID                   = 1U << 9,
127         PERF_SAMPLE_RAW                         = 1U << 10,
128
129         PERF_SAMPLE_MAX = 1U << 11,             /* non-ABI: first unused bit */
130 };
131
132 /*
133  * The format of the data returned by read() on a perf event fd,
134  * as specified by attr.read_format:
135  *
136  * struct read_format {
137  *      { u64           value;
138  *        { u64         time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
139  *        { u64         time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
140  *        { u64         id;           } && PERF_FORMAT_ID
141  *      } && !PERF_FORMAT_GROUP
142  *
143  *      { u64           nr;
144  *        { u64         time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
145  *        { u64         time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
146  *        { u64         value;
147  *          { u64       id;           } && PERF_FORMAT_ID
148  *        }             cntr[nr];
149  *      } && PERF_FORMAT_GROUP
150  * };
151  */
152 enum perf_event_read_format {
153         PERF_FORMAT_TOTAL_TIME_ENABLED          = 1U << 0,
154         PERF_FORMAT_TOTAL_TIME_RUNNING          = 1U << 1,
155         PERF_FORMAT_ID                          = 1U << 2,
156         PERF_FORMAT_GROUP                       = 1U << 3,
157
158         PERF_FORMAT_MAX = 1U << 4,              /* non-ABI: first unused bit */
159 };
160
161 #define PERF_ATTR_SIZE_VER0     64      /* sizeof first published struct */
162
163 /*
164  * Description of the event to monitor (the attr.* fields referenced above):
165  */
166 struct perf_event_attr {
167
168         /*
169          * Major type: hardware/software/tracepoint/etc.
170          */
171         __u32                   type;
172
173         /*
174          * Size of the attr structure, for fwd/bwd compat.
175          */
176         __u32                   size;
177
178         /*
179          * Type specific configuration information.
180          */
181         __u64                   config;
182
183         union {
184                 __u64           sample_period;
185                 __u64           sample_freq;
186         };
187
188         __u64                   sample_type;
189         __u64                   read_format;
190
191         __u64                   disabled       :  1, /* off by default        */
192                                 inherit        :  1, /* children inherit it   */
193                                 pinned         :  1, /* must always be on PMU */
194                                 exclusive      :  1, /* only group on PMU     */
195                                 exclude_user   :  1, /* don't count user      */
196                                 exclude_kernel :  1, /* ditto kernel          */
197                                 exclude_hv     :  1, /* ditto hypervisor      */
198                                 exclude_idle   :  1, /* don't count when idle */
199                                 mmap           :  1, /* include mmap data     */
200                                 comm           :  1, /* include comm data     */
201                                 freq           :  1, /* use freq, not period  */
202                                 inherit_stat   :  1, /* per task counts       */
203                                 enable_on_exec :  1, /* next exec enables     */
204                                 task           :  1, /* trace fork/exit       */
205                                 watermark      :  1, /* wakeup_watermark      */
206                                 /*
207                                  * precise_ip:
208                                  *
209                                  *  0 - SAMPLE_IP can have arbitrary skid
210                                  *  1 - SAMPLE_IP must have constant skid
211                                  *  2 - SAMPLE_IP requested to have 0 skid
212                                  *  3 - SAMPLE_IP must have 0 skid
213                                  *
214                                  *  See also PERF_RECORD_MISC_EXACT_IP
215                                  */
216                                 precise_ip     :  2, /* skid constraint       */
217                                 mmap_data      :  1, /* non-exec mmap data    */
218                                 sample_id_all  :  1, /* sample_type all events */
219
220                                 __reserved_1   : 45; /* 19 bits used above; pads the bit-field to 64 */
221
222         union {
223                 __u32           wakeup_events;    /* wakeup every n events */
224                 __u32           wakeup_watermark; /* bytes before wakeup   */
225         };
226
227         __u32                   bp_type;
228         union {
229                 __u64           bp_addr;
230                 __u64           config1; /* extension of config */
231         };
232         union {
233                 __u64           bp_len;
234                 __u64           config2; /* extension of config1 */
235         };
236 };
237
238 /*
239  * Ioctls that can be done on a perf event fd (all use '$' as the ioctl type):
240  */
241 #define PERF_EVENT_IOC_ENABLE           _IO ('$', 0)
242 #define PERF_EVENT_IOC_DISABLE          _IO ('$', 1)
243 #define PERF_EVENT_IOC_REFRESH          _IO ('$', 2)
244 #define PERF_EVENT_IOC_RESET            _IO ('$', 3)
245 #define PERF_EVENT_IOC_PERIOD           _IOW('$', 4, __u64)
246 #define PERF_EVENT_IOC_SET_OUTPUT       _IO ('$', 5)
247 #define PERF_EVENT_IOC_SET_FILTER       _IOW('$', 6, char *)
248 /* NOTE(review): presumably flag bits passed as the argument of the ioctls above -- confirm. */
249 enum perf_event_ioc_flags {
250         PERF_IOC_FLAG_GROUP             = 1U << 0,
251 };
252
253 /*
254  * Structure of the page that can be mapped via mmap
255  */
256 struct perf_event_mmap_page {
257         __u32   version;                /* version number of this structure */
258         __u32   compat_version;         /* lowest version this is compat with */
259
260         /*
261          * Bits needed to read the hw events in user-space.
262          *
263          *   u32 seq;
264          *   s64 count;
265          *
266          *   do {
267          *     seq = pc->lock;
268          *
269          *     barrier();
270          *     if (pc->index) {
271          *       count = pmc_read(pc->index - 1);
272          *       count += pc->offset;
273          *     } else
274          *       goto regular_read;
275          *
276          *     barrier();
277          *   } while (pc->lock != seq);
278          *
279          * NOTE: for obvious reasons this only works on self-monitoring
280          *       processes.
281          */
282         __u32   lock;                   /* seqlock for synchronization */
283         __u32   index;                  /* hardware event identifier */
284         __s64   offset;                 /* add to hardware event value */
285         __u64   time_enabled;           /* time event active */
286         __u64   time_running;           /* time event on cpu */
287
288                 /*
289                  * Hole for extension of the self monitor capabilities
290                  */
291
292         __u64   __reserved[123];        /* align to 1k: 40 bytes above + 123 * 8 = 1024 */
293
294         /*
295          * Control data for the mmap() data buffer.
296          *
297          * User-space reading the @data_head value should issue an rmb(), on
298          * SMP capable platforms, after reading this value -- see
299          * perf_event_wakeup().
300          *
301          * When the mapping is PROT_WRITE the @data_tail value should be
302          * written by userspace to reflect the last read data. In this case
303          * the kernel will not over-write unread data.
304          */
305         __u64   data_head;              /* head in the data section */
306         __u64   data_tail;              /* user-space written tail */
307 };
308 /* NOTE(review): presumably bit values for perf_event_header::misc (a __u16) -- confirm. */
309 #define PERF_RECORD_MISC_CPUMODE_MASK           (7 << 0)
310 #define PERF_RECORD_MISC_CPUMODE_UNKNOWN        (0 << 0)
311 #define PERF_RECORD_MISC_KERNEL                 (1 << 0)
312 #define PERF_RECORD_MISC_USER                   (2 << 0)
313 #define PERF_RECORD_MISC_HYPERVISOR             (3 << 0)
314 #define PERF_RECORD_MISC_GUEST_KERNEL           (4 << 0)
315 #define PERF_RECORD_MISC_GUEST_USER             (5 << 0)
316
317 /*
318  * Indicates that the content of PERF_SAMPLE_IP points to
319  * the actual instruction that triggered the event. See also
320  * perf_event_attr::precise_ip.
321  */
322 #define PERF_RECORD_MISC_EXACT_IP               (1 << 14)
323 /*
324  * Reserve the last bit (bit 15) to indicate some extended misc field
325  */
326 #define PERF_RECORD_MISC_EXT_RESERVED           (1 << 15)
327 /* Header that starts every record layout documented in enum perf_event_type. */
328 struct perf_event_header {
329         __u32   type;   /* NOTE(review): presumably an enum perf_event_type value -- confirm */
330         __u16   misc;
331         __u16   size;
332 };
333
334 enum perf_event_type {
335
336         /*
337          * If perf_event_attr.sample_id_all is set then all event types will
338          * have the sample_type selected fields related to where/when
339          * (identity) an event took place (TID, TIME, ID, CPU, STREAM_ID)
340          * described in PERF_RECORD_SAMPLE below. They are stashed after the
341          * perf_event_header and the fields already present for that record
342          * type, i.e. at the end of the payload. That way a newer perf.data
343          * file will be supported by older perf tools, with these new optional
344          * fields being ignored.
345          *
346          * The MMAP events record the PROT_EXEC mappings so that we can
347          * correlate userspace IPs to code. They have the following structure:
348          *
349          * struct {
350          *      struct perf_event_header        header;
351          *
352          *      u32                             pid, tid;
353          *      u64                             addr;
354          *      u64                             len;
355          *      u64                             pgoff;
356          *      char                            filename[];
357          * };
358          */
359         PERF_RECORD_MMAP                        = 1,
360
361         /*
362          * struct {
363          *      struct perf_event_header        header;
364          *      u64                             id;
365          *      u64                             lost;
366          * };
367          */
368         PERF_RECORD_LOST                        = 2,
369
370         /*
371          * struct {
372          *      struct perf_event_header        header;
373          *
374          *      u32                             pid, tid;
375          *      char                            comm[];
376          * };
377          */
378         PERF_RECORD_COMM                        = 3,
379
380         /*
381          * struct {
382          *      struct perf_event_header        header;
383          *      u32                             pid, ppid;
384          *      u32                             tid, ptid;
385          *      u64                             time;
386          * };
387          */
388         PERF_RECORD_EXIT                        = 4,
389
390         /*
391          * struct {
392          *      struct perf_event_header        header;
393          *      u64                             time;
394          *      u64                             id;
395          *      u64                             stream_id;
396          * };
397          */
398         PERF_RECORD_THROTTLE                    = 5,
399         PERF_RECORD_UNTHROTTLE                  = 6,
400
401         /*
402          * struct {
403          *      struct perf_event_header        header;
404          *      u32                             pid, ppid;
405          *      u32                             tid, ptid;
406          *      u64                             time;
407          * };
408          */
409         PERF_RECORD_FORK                        = 7,
410
411         /*
412          * struct {
413          *      struct perf_event_header        header;
414          *      u32                             pid, tid;
415          *
416          *      struct read_format              values;
417          * };
418          */
419         PERF_RECORD_READ                        = 8,
420
421         /*
422          * struct {
423          *      struct perf_event_header        header;
424          *
425          *      { u64                   ip;       } && PERF_SAMPLE_IP
426          *      { u32                   pid, tid; } && PERF_SAMPLE_TID
427          *      { u64                   time;     } && PERF_SAMPLE_TIME
428          *      { u64                   addr;     } && PERF_SAMPLE_ADDR
429          *      { u64                   id;       } && PERF_SAMPLE_ID
430          *      { u64                   stream_id;} && PERF_SAMPLE_STREAM_ID
431          *      { u32                   cpu, res; } && PERF_SAMPLE_CPU
432          *      { u64                   period;   } && PERF_SAMPLE_PERIOD
433          *
434          *      { struct read_format    values;   } && PERF_SAMPLE_READ
435          *
436          *      { u64                   nr,
437          *        u64                   ips[nr];  } && PERF_SAMPLE_CALLCHAIN
438          *
439          *      #
440          *      # The RAW record below is opaque data wrt the ABI
441          *      #
442          *      # That is, the ABI doesn't make any promises wrt
443          *      # the stability of its content, it may vary depending
444          *      # on event, hardware, kernel version and phase of
445          *      # the moon.
446          *      #
447          *      # In other words, PERF_SAMPLE_RAW contents are not an ABI.
448          *      #
449          *
450          *      { u32                   size;
451          *        char                  data[size];}&& PERF_SAMPLE_RAW
452          * };
453          */
454         PERF_RECORD_SAMPLE                      = 9,
455
456         PERF_RECORD_MAX,                        /* non-ABI */
457 };
458 /* Negative constants cast to __u64. NOTE(review): presumably context markers placed among callchain ips[] -- confirm. */
459 enum perf_callchain_context {
460         PERF_CONTEXT_HV                 = (__u64)-32,
461         PERF_CONTEXT_KERNEL             = (__u64)-128,
462         PERF_CONTEXT_USER               = (__u64)-512,
463
464         PERF_CONTEXT_GUEST              = (__u64)-2048,
465         PERF_CONTEXT_GUEST_KERNEL       = (__u64)-2176,
466         PERF_CONTEXT_GUEST_USER         = (__u64)-2560,
467
468         PERF_CONTEXT_MAX                = (__u64)-4095,
469 };
470 /* NOTE(review): presumably the flags argument of sys_perf_event_open() -- confirm. */
471 #define PERF_FLAG_FD_NO_GROUP   (1U << 0)
472 #define PERF_FLAG_FD_OUTPUT     (1U << 1)
473 #define PERF_FLAG_PID_CGROUP    (1U << 2) /* pid=cgroup id, per-cpu mode only */
474
475 #ifdef __KERNEL__
476 /*
477  * Kernel-internal data types and definitions:
478  */
479
480 #ifdef CONFIG_PERF_EVENTS
481 # include <linux/cgroup.h>
482 # include <asm/perf_event.h>
483 # include <asm/local64.h>
484 #endif
485 /* Table of argument-less hooks. NOTE(review): presumably supplied by virtualization code to describe guest state -- confirm. */
486 struct perf_guest_info_callbacks {
487         int (*is_in_guest) (void);
488         int (*is_user_mode) (void);
489         unsigned long (*get_guest_ip) (void);
490 };
491
492 #ifdef CONFIG_HAVE_HW_BREAKPOINT
493 #include <asm/hw_breakpoint.h>
494 #endif
495
496 #include <linux/list.h>
497 #include <linux/mutex.h>
498 #include <linux/rculist.h>
499 #include <linux/rcupdate.h>
500 #include <linux/spinlock.h>
501 #include <linux/hrtimer.h>
502 #include <linux/fs.h>
503 #include <linux/pid_namespace.h>
504 #include <linux/workqueue.h>
505 #include <linux/ftrace.h>
506 #include <linux/cpu.h>
507 #include <linux/irq_work.h>
508 #include <linux/jump_label_ref.h>
509 #include <asm/atomic.h>
510 #include <asm/local.h>
511
512 #define PERF_MAX_STACK_DEPTH            255     /* capacity of perf_callchain_entry::ip[] */
513
514 struct perf_callchain_entry {
515         __u64                           nr;     /* NOTE(review): presumably the number of valid ip[] slots -- confirm */
516         __u64                           ip[PERF_MAX_STACK_DEPTH];
517 };
518
519 struct perf_raw_record {
520         u32                             size;   /* NOTE(review): presumably the byte length of @data -- confirm */
521         void                            *data;
522 };
523
524 struct perf_branch_entry {      /* element type of perf_branch_stack::entries[] */
525         __u64                           from;
526         __u64                           to;
527         __u64                           flags;
528 };
529
530 struct perf_branch_stack {
531         __u64                           nr;     /* NOTE(review): presumably the number of entries[] that follow -- confirm */
532         struct perf_branch_entry        entries[0];     /* zero-length trailing array */
533 };
534
535 struct task_struct;
536
537 /**
538  * struct hw_perf_event - per-event PMU state; empty without CONFIG_PERF_EVENTS
539  */
540 struct hw_perf_event {
541 #ifdef CONFIG_PERF_EVENTS
542         union {
543                 struct { /* hardware */
544                         u64             config;
545                         u64             last_tag;
546                         unsigned long   config_base;
547                         unsigned long   event_base;
548                         int             idx;
549                         int             last_cpu;
550                         unsigned int    extra_reg;
551                         u64             extra_config;
552                         int             extra_alloc;
553                 };
554                 struct { /* software */
555                         struct hrtimer  hrtimer;
556                 };
557 #ifdef CONFIG_HAVE_HW_BREAKPOINT
558                 struct { /* breakpoint */
559                         struct arch_hw_breakpoint       info;
560                         struct list_head                bp_list;
561                         /*
562                          * Crufty hack to avoid the chicken and egg
563                          * problem hw_breakpoint has with context
564                          * creation and event initialization.
565                          */
566                         struct task_struct              *bp_target;
567                 };
568 #endif
569         };
570         int                             state;  /* PERF_HES_* flags */
571         local64_t                       prev_count;
572         u64                             sample_period;
573         u64                             last_period;
574         local64_t                       period_left;
575         u64                             interrupts;
576
577         u64                             freq_time_stamp;
578         u64                             freq_count_stamp;
579 #endif
580 };
581
582 /*
583  * hw_perf_event::state flags (bit mask)
584  */
585 #define PERF_HES_STOPPED        0x01 /* the counter is stopped */
586 #define PERF_HES_UPTODATE       0x02 /* event->count up-to-date */
587 #define PERF_HES_ARCH           0x04 /* NOTE(review): presumably reserved for arch code -- confirm */
588
589 struct perf_event;
590
591 /*
592  * Common implementation detail of pmu::{start,commit,cancel}_txn
593  */
594 #define PERF_EVENT_TXN 0x1
595
596 /**
597  * struct pmu - generic performance monitoring unit
598  */
599 struct pmu {
600         struct list_head                entry;
601
602         struct device                   *dev;
603         char                            *name;
604         int                             type;
605
606         int * __percpu                  pmu_disable_count;
607         struct perf_cpu_context * __percpu pmu_cpu_context;
608         int                             task_ctx_nr;
609
610         /*
611          * Fully disable/enable this PMU, can be used to protect from the PMI
612          * as well as for lazy/batch writing of the MSRs.
613          */
614         void (*pmu_enable)              (struct pmu *pmu); /* optional */
615         void (*pmu_disable)             (struct pmu *pmu); /* optional */
616
617         /*
618          * Try and initialize the event for this PMU.
619          * Should return -ENOENT when the @event doesn't match this PMU.
620          */
621         int (*event_init)               (struct perf_event *event);
622
623 #define PERF_EF_START   0x01            /* start the counter when adding    */
624 #define PERF_EF_RELOAD  0x02            /* reload the counter when starting */
625 #define PERF_EF_UPDATE  0x04            /* update the counter when stopping */
626
627         /*
628          * Adds/Removes a counter to/from the PMU, can be done inside
629          * a transaction, see the ->*_txn() methods.
630          */
631         int  (*add)                     (struct perf_event *event, int flags);
632         void (*del)                     (struct perf_event *event, int flags);
633
634         /*
635          * Starts/Stops a counter present on the PMU. The PMI handler
636          * should stop the counter when perf_event_overflow() returns
637          * !0. ->start() will be used to continue.
638          */
639         void (*start)                   (struct perf_event *event, int flags);
640         void (*stop)                    (struct perf_event *event, int flags);
641
642         /*
643          * Updates the counter value of the event.
644          */
645         void (*read)                    (struct perf_event *event);
646
647         /*
648          * Group events scheduling is treated as a transaction, add
649          * group events as a whole and perform one schedulability test.
650          * If the test fails, roll back the whole group
651          *
652          * Start the transaction, after this ->add() doesn't need to
653          * do schedulability tests.
654          */
655         void (*start_txn)       (struct pmu *pmu); /* optional */
656         /*
657          * If ->start_txn() disabled the ->add() schedulability test
658          * then ->commit_txn() is required to perform one. On success
659          * the transaction is closed. On error the transaction is kept
660          * open until ->cancel_txn() is called.
661          */
662         int  (*commit_txn)      (struct pmu *pmu); /* optional */
663         /*
664          * Will cancel the transaction, assumes ->del() is called
665          * for each successful ->add() during the transaction.
666          */
667         void (*cancel_txn)      (struct pmu *pmu); /* optional */
668 };
669
670 /**
671  * enum perf_event_active_state - the states of an event
672  */
673 enum perf_event_active_state {
674         PERF_EVENT_STATE_ERROR          = -2,
675         PERF_EVENT_STATE_OFF            = -1,
676         PERF_EVENT_STATE_INACTIVE       =  0,
677         PERF_EVENT_STATE_ACTIVE         =  1,
678 };
679
680 struct file;
681
682 #define PERF_BUFFER_WRITABLE            0x01    /* NOTE(review): presumably a buffer-allocation flag behind perf_buffer::writable -- confirm */
683
684 struct perf_buffer {
685         atomic_t                        refcount;
686         struct rcu_head                 rcu_head;
687 #ifdef CONFIG_PERF_USE_VMALLOC
688         struct work_struct              work;
689         int                             page_order;     /* allocation order  */
690 #endif
691         int                             nr_pages;       /* nr of data pages  */
692         int                             writable;       /* are we writable   */
693
694         atomic_t                        poll;           /* POLL_ for wakeups */
695
696         local_t                         head;           /* write position    */
697         local_t                         nest;           /* nested writers    */
698         local_t                         events;         /* event limit       */
699         local_t                         wakeup;         /* wakeup stamp      */
700         local_t                         lost;           /* nr records lost   */
701
702         long                            watermark;      /* wakeup watermark  */
703
704         struct perf_event_mmap_page     *user_page;
705         void                            *data_pages[0]; /* zero-length trailing array of page pointers */
706 };
707
708 struct perf_sample_data;
709 /* Overflow callback type. NOTE(review): the meaning of the int argument is not visible in this header -- confirm. */
710 typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
711                                         struct perf_sample_data *,
712                                         struct pt_regs *regs);
713 /* NOTE(review): presumably bits for perf_event::group_flags -- confirm. */
714 enum perf_group_flag {
715         PERF_GROUP_SOFTWARE = 0x1,
716 };
717
718 #define SWEVENT_HLIST_BITS      8
719 #define SWEVENT_HLIST_SIZE      (1 << SWEVENT_HLIST_BITS)       /* 256 list heads */
720
721 struct swevent_hlist {
722         struct hlist_head       heads[SWEVENT_HLIST_SIZE];
723         struct rcu_head         rcu_head;
724 };
725 /* NOTE(review): presumably bits for perf_event::attach_state -- confirm. */
726 #define PERF_ATTACH_CONTEXT     0x01
727 #define PERF_ATTACH_GROUP       0x02
728 #define PERF_ATTACH_TASK        0x04
729
730 #ifdef CONFIG_CGROUP_PERF
731 /*
732  * perf_cgroup_info keeps track of time_enabled for a cgroup.
733  * This is a per-cpu dynamically allocated data structure.
734  */
735 struct perf_cgroup_info {
736         u64 time;
737         u64 timestamp;  /* NOTE(review): presumably when @time was last updated -- confirm */
738 };
739
740 struct perf_cgroup {
741         struct cgroup_subsys_state css;
742         struct perf_cgroup_info *info;  /* timing info, one per cpu */
743 };
744 #endif
745
746 /**
747  * struct perf_event - performance event kernel representation:
748  */
749 struct perf_event {
750 #ifdef CONFIG_PERF_EVENTS
751         struct list_head                group_entry;
752         struct list_head                event_entry;
753         struct list_head                sibling_list;
754         struct hlist_node               hlist_entry;
755         int                             nr_siblings;
756         int                             group_flags;
757         struct perf_event               *group_leader;
758         struct pmu                      *pmu;
759
760         enum perf_event_active_state    state;
761         unsigned int                    attach_state;
762         local64_t                       count;
763         atomic64_t                      child_count;
764
765         /*
766          * These are the total time in nanoseconds that the event
767          * has been enabled (i.e. eligible to run, and the task has
768          * been scheduled in, if this is a per-task event)
769          * and running (scheduled onto the CPU), respectively.
770          *
771          * They are computed from tstamp_enabled, tstamp_running and
772          * tstamp_stopped when the event is in INACTIVE or ACTIVE state.
773          */
774         u64                             total_time_enabled;
775         u64                             total_time_running;
776
777         /*
778          * These are timestamps used for computing total_time_enabled
779          * and total_time_running when the event is in INACTIVE or
780          * ACTIVE state, measured in nanoseconds from an arbitrary point
781          * in time.
782          * tstamp_enabled: the notional time when the event was enabled
783          * tstamp_running: the notional time when the event was scheduled on
784          * tstamp_stopped: in INACTIVE state, the notional time when the
785          *      event was scheduled off.
786          */
787         u64                             tstamp_enabled;
788         u64                             tstamp_running;
789         u64                             tstamp_stopped;
790
791         /*
792          * timestamp shadows the actual context timing but it can
793          * be safely used in NMI interrupt context. It reflects the
794          * context time as it was when the event was last scheduled in.
795          *
796          * ctx_time already accounts for ctx->timestamp. Therefore to
797          * compute ctx_time for a sample, simply add perf_clock().
798          */
799         u64                             shadow_ctx_time;
800
801         struct perf_event_attr          attr;
802         u16                             header_size;
803         u16                             id_header_size;
804         u16                             read_size;
805         struct hw_perf_event            hw;
806
807         struct perf_event_context       *ctx;
808         struct file                     *filp;
809
810         /*
811          * These accumulate total time (in nanoseconds) that children
812          * events have been enabled and running, respectively.
813          */
814         atomic64_t                      child_total_time_enabled;
815         atomic64_t                      child_total_time_running;
816
817         /*
818          * Protect attach/detach and child_list:
819          */
820         struct mutex                    child_mutex;
821         struct list_head                child_list;
822         struct perf_event               *parent;
823
824         int                             oncpu;
825         int                             cpu;
826
827         struct list_head                owner_entry;
828         struct task_struct              *owner;
829
830         /* mmap bits */
831         struct mutex                    mmap_mutex;
832         atomic_t                        mmap_count;
833         int                             mmap_locked;
834         struct user_struct              *mmap_user;
835         struct perf_buffer              *buffer;
836
837         /* poll related */
838         wait_queue_head_t               waitq;
839         struct fasync_struct            *fasync;
840
841         /* delayed work for NMIs and such */
842         int                             pending_wakeup;
843         int                             pending_kill;
844         int                             pending_disable;
845         struct irq_work                 pending;
846
847         atomic_t                        event_limit;
848
849         void (*destroy)(struct perf_event *);
850         struct rcu_head                 rcu_head;
851
852         struct pid_namespace            *ns;
853         u64                             id;
854
855         perf_overflow_handler_t         overflow_handler;
856
857 #ifdef CONFIG_EVENT_TRACING
858         struct ftrace_event_call        *tp_event;
859         struct event_filter             *filter;
860 #endif
861
862 #ifdef CONFIG_CGROUP_PERF
863         struct perf_cgroup              *cgrp; /* cgroup event is attach to */
864         int                             cgrp_defer_enabled;
865 #endif
866
867 #endif /* CONFIG_PERF_EVENTS */
868 };
869
870 enum perf_event_context_type {      /* values of perf_event_context.type */
871         task_context,
872         cpu_context,
873 };
874
875 /**
876  * struct perf_event_context - event context structure
877  *
878  * Used as a container for both task events and CPU events:
879  */
880 struct perf_event_context {
881         struct pmu                      *pmu;
882         enum perf_event_context_type    type;   /* task_context or cpu_context */
883         /*
884          * Protect the states of the events in the list,
885          * nr_active, and the list:
886          */
887         raw_spinlock_t                  lock;
888         /*
889          * Protect the list of events.  Locking either mutex or lock
890          * is sufficient to ensure the list doesn't change; to change
891          * the list you need to lock both the mutex and the spinlock.
892          */
893         struct mutex                    mutex;
894
895         struct list_head                pinned_groups;
896         struct list_head                flexible_groups;
897         struct list_head                event_list;
898         int                             nr_events;
899         int                             nr_active;
900         int                             is_active;
901         int                             nr_stat;
902         int                             rotate_disable;
903         atomic_t                        refcount;
904         struct task_struct              *task;
905
906         /*
907          * Context clock, runs while the context is enabled.
908          */
909         u64                             time;
910         u64                             timestamp;
911
912         /*
913          * These fields let us detect when two contexts have both
914          * been cloned (inherited) from a common ancestor.
915          */
916         struct perf_event_context       *parent_ctx;
917         u64                             parent_gen;
918         u64                             generation;
919         int                             pin_count;
920         struct rcu_head                 rcu_head;
921         int                             nr_cgroups; /* cgroup events present */
922 };
923
924 /*
925  * Number of execution contexts in which an event can trigger:
926  *      task, softirq, hardirq, nmi (one each, hence 4).
927  */
928 #define PERF_NR_CONTEXTS        4
929
930 /**
931  * struct perf_cpu_context - per cpu event context structure
932  */
933 struct perf_cpu_context {
934         struct perf_event_context       ctx;            /* embedded by value, not a pointer */
935         struct perf_event_context       *task_ctx;
936         int                             active_oncpu;
937         int                             exclusive;
938         struct list_head                rotation_list;
939         int                             jiffies_interval;
940         struct pmu                      *active_pmu;
941         struct perf_cgroup              *cgrp;
942 };
943
944 struct perf_output_handle {     /* handle taken by perf_output_begin(), perf_output_copy() and perf_output_end() */
945         struct perf_event               *event;
946         struct perf_buffer              *buffer;
947         unsigned long                   wakeup;
948         unsigned long                   size;
949         void                            *addr;
950         int                             page;
951         int                             nmi;    /* NOTE(review): presumably perf_output_begin()'s nmi argument - confirm */
952         int                             sample; /* NOTE(review): presumably perf_output_begin()'s sample argument - confirm */
953 };
954
955 #ifdef CONFIG_PERF_EVENTS      /* real API; the matching #else branch supplies inline stubs for part of it */
956
957 extern int perf_pmu_register(struct pmu *pmu, char *name, int type);
958 extern void perf_pmu_unregister(struct pmu *pmu);
959
960 extern int perf_num_counters(void);
961 extern const char *perf_pmu_name(void);
962 extern void __perf_event_task_sched_in(struct task_struct *task);       /* called by the perf_event_task_sched_in() inline below */
963 extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);   /* called by the perf_event_task_sched_out() inline below */
964 extern int perf_event_init_task(struct task_struct *child);
965 extern void perf_event_exit_task(struct task_struct *child);
966 extern void perf_event_free_task(struct task_struct *task);
967 extern void perf_event_delayed_put(struct task_struct *task);
968 extern void perf_event_print_debug(void);
969 extern void perf_pmu_disable(struct pmu *pmu);
970 extern void perf_pmu_enable(struct pmu *pmu);
971 extern int perf_event_task_disable(void);
972 extern int perf_event_task_enable(void);
973 extern void perf_event_update_userpage(struct perf_event *event);
974 extern int perf_event_release_kernel(struct perf_event *event);
975 extern struct perf_event *
976 perf_event_create_kernel_counter(struct perf_event_attr *attr,
977                                 int cpu,
978                                 struct task_struct *task,
979                                 perf_overflow_handler_t callback);
980 extern u64 perf_event_read_value(struct perf_event *event,
981                                  u64 *enabled, u64 *running);
982
983 struct perf_sample_data {       /* perf_sample_data_init() sets only addr and raw */
984         u64                             type;
985
986         u64                             ip;
987         struct {
988                 u32     pid;
989                 u32     tid;
990         }                               tid_entry;
991         u64                             time;
992         u64                             addr;           /* set by perf_sample_data_init() */
993         u64                             id;
994         u64                             stream_id;
995         struct {
996                 u32     cpu;
997                 u32     reserved;
998         }                               cpu_entry;
999         u64                             period;
1000         struct perf_callchain_entry     *callchain;
1001         struct perf_raw_record          *raw;          /* reset to NULL by perf_sample_data_init() */
1002 };
1003
1004 static inline
1005 void perf_sample_data_init(struct perf_sample_data *data, u64 addr)
1006 {
1007         data->addr = addr;
1008         data->raw  = NULL;      /* every other field of *data is left untouched here */
1009 }
1010
1011 extern void perf_output_sample(struct perf_output_handle *handle,
1012                                struct perf_event_header *header,
1013                                struct perf_sample_data *data,
1014                                struct perf_event *event);
1015 extern void perf_prepare_sample(struct perf_event_header *header,
1016                                 struct perf_sample_data *data,
1017                                 struct perf_event *event,
1018                                 struct pt_regs *regs);
1019
1020 extern int perf_event_overflow(struct perf_event *event, int nmi,
1021                                  struct perf_sample_data *data,
1022                                  struct pt_regs *regs);
1023
1024 static inline bool is_sampling_event(struct perf_event *event)
1025 {
1026         return event->attr.sample_period != 0;  /* sampling event <=> nonzero attr.sample_period */
1027 }
1028
1029 /*
1030  * Return 1 for a software event (pmu->task_ctx_nr == perf_sw_context), 0 otherwise
1031  */
1032 static inline int is_software_event(struct perf_event *event)
1033 {
1034         return event->pmu->task_ctx_nr == perf_sw_context;
1035 }
1036
1037 extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
1038
1039 extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64);
1040
1041 #ifndef perf_arch_fetch_caller_regs    /* fallback: empty stub unless the name is already #defined */
1042 static inline void
1043 perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
1044 #endif
1045
1046 /*
1047  * Take a snapshot of the regs. Skip ip and frame pointer to
1048  * the nth caller. We only need a few of the regs:
1049  * - ip for PERF_SAMPLE_IP
1050  * - cs for user_mode() tests
1051  * - bp for callchains
1052  * - eflags, for future purposes, just in case
1053  */
1054 static inline void perf_fetch_caller_regs(struct pt_regs *regs)
1055 {
1056         memset(regs, 0, sizeof(*regs)); /* regs stay all zero if only the empty fallback stub exists */
1057
1058         perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
1059 }
1060
1061 static __always_inline void
1062 perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
1063 {
1064         struct pt_regs hot_regs;        /* on-stack snapshot, used only when regs == NULL */
1065
1066         JUMP_LABEL(&perf_swevent_enabled[event_id], have_event);       /* NOTE(review): presumably branches only when this event_id is enabled - confirm */
1067         return;
1068
1069 have_event:
1070         if (!regs) {    /* caller passed no regs: capture the caller's own */
1071                 perf_fetch_caller_regs(&hot_regs);
1072                 regs = &hot_regs;
1073         }
1074         __perf_sw_event(event_id, nr, nmi, regs, addr);
1075 }
1076
1077 extern atomic_t perf_sched_events;
1078
1079 static inline void perf_event_task_sched_in(struct task_struct *task)
1080 {
1081         COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task));        /* NOTE(review): presumably runs the call only when perf_sched_events is set - confirm */
1082 }
1083
1084 static inline
1085 void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
1086 {
1087         perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);   /* nr=1, nmi=1, regs=NULL, addr=0 */
1088
1089         __perf_event_task_sched_out(task, next);        /* called directly, not through COND_STMT as in sched_in */
1090 }
1091
1092 extern void perf_event_mmap(struct vm_area_struct *vma);
1093 extern struct perf_guest_info_callbacks *perf_guest_cbs;
1094 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
1095 extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
1096
1097 extern void perf_event_comm(struct task_struct *tsk);
1098 extern void perf_event_fork(struct task_struct *tsk);
1099
1100 /* Callchains */
1101 DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
1102
1103 extern void perf_callchain_user(struct perf_callchain_entry *entry,
1104                                 struct pt_regs *regs);
1105 extern void perf_callchain_kernel(struct perf_callchain_entry *entry,
1106                                   struct pt_regs *regs);
1107
1108
1109 static inline void
1110 perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
1111 {
1112         if (entry->nr < PERF_MAX_STACK_DEPTH)   /* once nr reaches the limit, further ips are silently dropped */
1113                 entry->ip[entry->nr++] = ip;
1114 }
1115
1116 extern int sysctl_perf_event_paranoid;
1117 extern int sysctl_perf_event_mlock;
1118 extern int sysctl_perf_event_sample_rate;
1119
1120 extern int perf_proc_update_handler(struct ctl_table *table, int write,
1121                 void __user *buffer, size_t *lenp,
1122                 loff_t *ppos);
1123
1124 static inline bool perf_paranoid_tracepoint_raw(void)
1125 {
1126         return sysctl_perf_event_paranoid > -1; /* true when sysctl_perf_event_paranoid >= 0 */
1127 }
1128
1129 static inline bool perf_paranoid_cpu(void)
1130 {
1131         return sysctl_perf_event_paranoid > 0;  /* true when sysctl_perf_event_paranoid >= 1 */
1132 }
1133
1134 static inline bool perf_paranoid_kernel(void)
1135 {
1136         return sysctl_perf_event_paranoid > 1;  /* true when sysctl_perf_event_paranoid >= 2 */
1137 }
1138
1139 extern void perf_event_init(void);
1140 extern void perf_tp_event(u64 addr, u64 count, void *record,
1141                           int entry_size, struct pt_regs *regs,
1142                           struct hlist_head *head, int rctx);
1143 extern void perf_bp_event(struct perf_event *event, void *data);
1144
1145 #ifndef perf_misc_flags        /* defaults for both macros below, unless perf_misc_flags is already #defined */
1146 #define perf_misc_flags(regs)   (user_mode(regs) ? PERF_RECORD_MISC_USER : \
1147                                  PERF_RECORD_MISC_KERNEL)
1148 #define perf_instruction_pointer(regs)  instruction_pointer(regs)
1149 #endif
1150
1151 extern int perf_output_begin(struct perf_output_handle *handle,
1152                              struct perf_event *event, unsigned int size,
1153                              int nmi, int sample);
1154 extern void perf_output_end(struct perf_output_handle *handle);
1155 extern void perf_output_copy(struct perf_output_handle *handle,
1156                              const void *buf, unsigned int len);
1157 extern int perf_swevent_get_recursion_context(void);
1158 extern void perf_swevent_put_recursion_context(int rctx);
1159 extern void perf_event_enable(struct perf_event *event);
1160 extern void perf_event_disable(struct perf_event *event);
1161 extern void perf_event_task_tick(void);
1162 #else  /* !CONFIG_PERF_EVENTS: inline stubs that do nothing and return 0, -EINVAL or -1 */
1163 static inline void
1164 perf_event_task_sched_in(struct task_struct *task)                      { }
1165 static inline void
1166 perf_event_task_sched_out(struct task_struct *task,
1167                             struct task_struct *next)                   { }
1168 static inline int perf_event_init_task(struct task_struct *child)       { return 0; }
1169 static inline void perf_event_exit_task(struct task_struct *child)      { }
1170 static inline void perf_event_free_task(struct task_struct *task)       { }
1171 static inline void perf_event_delayed_put(struct task_struct *task)     { }
1172 static inline void perf_event_print_debug(void)                         { }
1173 static inline int perf_event_task_disable(void)                         { return -EINVAL; }
1174 static inline int perf_event_task_enable(void)                          { return -EINVAL; }
1175
1176 static inline void
1177 perf_sw_event(u32 event_id, u64 nr, int nmi,
1178                      struct pt_regs *regs, u64 addr)                    { }
1179 static inline void
1180 perf_bp_event(struct perf_event *event, void *data)                     { }
1181
1182 static inline int perf_register_guest_info_callbacks
1183 (struct perf_guest_info_callbacks *callbacks) { return 0; }
1184 static inline int perf_unregister_guest_info_callbacks
1185 (struct perf_guest_info_callbacks *callbacks) { return 0; }
1186
1187 static inline void perf_event_mmap(struct vm_area_struct *vma)          { }
1188 static inline void perf_event_comm(struct task_struct *tsk)             { }
1189 static inline void perf_event_fork(struct task_struct *tsk)             { }
1190 static inline void perf_event_init(void)                                { }
1191 static inline int  perf_swevent_get_recursion_context(void)             { return -1; }
1192 static inline void perf_swevent_put_recursion_context(int rctx)         { }
1193 static inline void perf_event_enable(struct perf_event *event)          { }
1194 static inline void perf_event_disable(struct perf_event *event)         { }
1195 static inline void perf_event_task_tick(void)                           { }
1196 #endif /* CONFIG_PERF_EVENTS */
1197
1198 #define perf_output_put(handle, x) \
1199         perf_output_copy((handle), &(x), sizeof(x))     /* x must be an lvalue: its address and size are passed */
1200
1201 /*
1202  * This has to have a higher priority than migration_notifier in sched.c.
1203  */
1204 #define perf_cpu_notifier(fn)                                   \
1205 do {                                                            \
1206         static struct notifier_block fn##_nb __cpuinitdata =    \
1207                 { .notifier_call = fn, .priority = CPU_PRI_PERF }; \
1208         fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE,             \
1209                 (void *)(unsigned long)smp_processor_id());     \
1210         fn(&fn##_nb, (unsigned long)CPU_STARTING,               \
1211                 (void *)(unsigned long)smp_processor_id());     \
1212         fn(&fn##_nb, (unsigned long)CPU_ONLINE,                 \
1213                 (void *)(unsigned long)smp_processor_id());     \
1214         register_cpu_notifier(&fn##_nb);                        \
1215 } while (0)    /* fn is first called by hand (UP_PREPARE, STARTING, ONLINE) for the current CPU, then registered */
1216
1217 #endif /* __KERNEL__ */
1218 #endif /* _LINUX_PERF_EVENT_H */