11847bf1e8cc254db7f2a2a255511fd36eea4a68
[linux-2.6.git] / kernel / perf_event.c
1 /*
2  * Performance events core code:
3  *
4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8  *
9  * For licensing details see kernel-base/COPYING
10  */
11
12 #include <linux/fs.h>
13 #include <linux/mm.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/idr.h>
17 #include <linux/file.h>
18 #include <linux/poll.h>
19 #include <linux/slab.h>
20 #include <linux/hash.h>
21 #include <linux/sysfs.h>
22 #include <linux/dcache.h>
23 #include <linux/percpu.h>
24 #include <linux/ptrace.h>
25 #include <linux/reboot.h>
26 #include <linux/vmstat.h>
27 #include <linux/device.h>
28 #include <linux/vmalloc.h>
29 #include <linux/hardirq.h>
30 #include <linux/rculist.h>
31 #include <linux/uaccess.h>
32 #include <linux/syscalls.h>
33 #include <linux/anon_inodes.h>
34 #include <linux/kernel_stat.h>
35 #include <linux/perf_event.h>
36 #include <linux/ftrace_event.h>
37 #include <linux/hw_breakpoint.h>
38
39 #include <asm/irq_regs.h>
40
/*
 * Atomic event counters. Their readers and writers are not in this part of
 * the file -- presumably each counts live events of the named kind; TODO
 * confirm against the users.
 */
41 atomic_t perf_task_events __read_mostly;
42 static atomic_t nr_mmap_events __read_mostly;
43 static atomic_t nr_comm_events __read_mostly;
44 static atomic_t nr_task_events __read_mostly;
45
/*
 * The 'pmus' list together with a mutex and an SRCU domain named after it.
 * NOTE(review): the registration/lookup code is not visible here -- confirm
 * there which of the two protects writers and which protects readers.
 */
46 static LIST_HEAD(pmus);
47 static DEFINE_MUTEX(pmus_lock);
48 static struct srcu_struct pmus_srcu;
49
50 /*
51  * perf event paranoia level:
52  *  -1 - not paranoid at all
53  *   0 - disallow raw tracepoint access for unpriv
54  *   1 - disallow cpu events for unpriv
55  *   2 - disallow kernel profiling for unpriv
56  */
/* Default level is 1. */
57 int sysctl_perf_event_paranoid __read_mostly = 1;
58
59 int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
60
61 /*
62  * max perf event sample rate
63  */
/* Default is 100000; unit presumably samples per second -- TODO confirm. */
64 int sysctl_perf_event_sample_rate __read_mostly = 100000;
65
/*
 * 64-bit atomic counter. NOTE(review): presumably the source of event->id
 * values; its users are outside this part of the file -- confirm.
 */
66 static atomic64_t perf_event_id;
67
68 void __weak perf_event_print_debug(void)        { }
69
70 extern __weak const char *perf_pmu_name(void)
71 {
72         return "pmu";
73 }
74
75 void perf_pmu_disable(struct pmu *pmu)
76 {
77         int *count = this_cpu_ptr(pmu->pmu_disable_count);
78         if (!(*count)++)
79                 pmu->pmu_disable(pmu);
80 }
81
82 void perf_pmu_enable(struct pmu *pmu)
83 {
84         int *count = this_cpu_ptr(pmu->pmu_disable_count);
85         if (!--(*count))
86                 pmu->pmu_enable(pmu);
87 }
88
89 static DEFINE_PER_CPU(struct list_head, rotation_list);
90
91 /*
92  * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
93  * because they're strictly cpu affine and rotate_start is called with IRQs
94  * disabled, while rotate_context is called from IRQ context.
95  */
96 static void perf_pmu_rotate_start(struct pmu *pmu)
97 {
98         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
99         struct list_head *head = &__get_cpu_var(rotation_list);
100
101         WARN_ON(!irqs_disabled());
102
103         if (list_empty(&cpuctx->rotation_list))
104                 list_add(&cpuctx->rotation_list, head);
105 }
106
107 static void get_ctx(struct perf_event_context *ctx)
108 {
109         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
110 }
111
112 static void free_ctx(struct rcu_head *head)
113 {
114         struct perf_event_context *ctx;
115
116         ctx = container_of(head, struct perf_event_context, rcu_head);
117         kfree(ctx);
118 }
119
120 static void put_ctx(struct perf_event_context *ctx)
121 {
122         if (atomic_dec_and_test(&ctx->refcount)) {
123                 if (ctx->parent_ctx)
124                         put_ctx(ctx->parent_ctx);
125                 if (ctx->task)
126                         put_task_struct(ctx->task);
127                 call_rcu(&ctx->rcu_head, free_ctx);
128         }
129 }
130
131 static void unclone_ctx(struct perf_event_context *ctx)
132 {
133         if (ctx->parent_ctx) {
134                 put_ctx(ctx->parent_ctx);
135                 ctx->parent_ctx = NULL;
136         }
137 }
138
139 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
140 {
141         /*
142          * only top level events have the pid namespace they were created in
143          */
144         if (event->parent)
145                 event = event->parent;
146
147         return task_tgid_nr_ns(p, event->ns);
148 }
149
150 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
151 {
152         /*
153          * only top level events have the pid namespace they were created in
154          */
155         if (event->parent)
156                 event = event->parent;
157
158         return task_pid_nr_ns(p, event->ns);
159 }
160
161 /*
162  * If we inherit events we want to return the parent event id
163  * to userspace.
164  */
165 static u64 primary_event_id(struct perf_event *event)
166 {
167         u64 id = event->id;
168
169         if (event->parent)
170                 id = event->parent->id;
171
172         return id;
173 }
174
175 /*
176  * Get the perf_event_context for a task and lock it.
177  * This has to cope with with the fact that until it is locked,
178  * the context could get moved to another task.
179  */
180 static struct perf_event_context *
181 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
182 {
183         struct perf_event_context *ctx;
184
185         rcu_read_lock();
186 retry:
187         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
188         if (ctx) {
189                 /*
190                  * If this context is a clone of another, it might
191                  * get swapped for another underneath us by
192                  * perf_event_task_sched_out, though the
193                  * rcu_read_lock() protects us from any context
194                  * getting freed.  Lock the context and check if it
195                  * got swapped before we could get the lock, and retry
196                  * if so.  If we locked the right context, then it
197                  * can't get swapped on us any more.
198                  */
199                 raw_spin_lock_irqsave(&ctx->lock, *flags);
200                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
201                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
202                         goto retry;
203                 }
204
205                 if (!atomic_inc_not_zero(&ctx->refcount)) {
206                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
207                         ctx = NULL;
208                 }
209         }
210         rcu_read_unlock();
211         return ctx;
212 }
213
214 /*
215  * Get the context for a task and increment its pin_count so it
216  * can't get swapped to another task.  This also increments its
217  * reference count so that the context can't get freed.
218  */
219 static struct perf_event_context *
220 perf_pin_task_context(struct task_struct *task, int ctxn)
221 {
222         struct perf_event_context *ctx;
223         unsigned long flags;
224
225         ctx = perf_lock_task_context(task, ctxn, &flags);
226         if (ctx) {
227                 ++ctx->pin_count;
228                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
229         }
230         return ctx;
231 }
232
233 static void perf_unpin_context(struct perf_event_context *ctx)
234 {
235         unsigned long flags;
236
237         raw_spin_lock_irqsave(&ctx->lock, flags);
238         --ctx->pin_count;
239         raw_spin_unlock_irqrestore(&ctx->lock, flags);
240         put_ctx(ctx);
241 }
242
243 static inline u64 perf_clock(void)
244 {
245         return local_clock();
246 }
247
248 /*
249  * Update the record of the current time in a context.
250  */
251 static void update_context_time(struct perf_event_context *ctx)
252 {
253         u64 now = perf_clock();
254
255         ctx->time += now - ctx->timestamp;
256         ctx->timestamp = now;
257 }
258
259 /*
260  * Update the total_time_enabled and total_time_running fields for a event.
261  */
262 static void update_event_times(struct perf_event *event)
263 {
264         struct perf_event_context *ctx = event->ctx;
265         u64 run_end;
266
267         if (event->state < PERF_EVENT_STATE_INACTIVE ||
268             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
269                 return;
270
271         if (ctx->is_active)
272                 run_end = ctx->time;
273         else
274                 run_end = event->tstamp_stopped;
275
276         event->total_time_enabled = run_end - event->tstamp_enabled;
277
278         if (event->state == PERF_EVENT_STATE_INACTIVE)
279                 run_end = event->tstamp_stopped;
280         else
281                 run_end = ctx->time;
282
283         event->total_time_running = run_end - event->tstamp_running;
284 }
285
286 /*
287  * Update total_time_enabled and total_time_running for all events in a group.
288  */
289 static void update_group_times(struct perf_event *leader)
290 {
291         struct perf_event *event;
292
293         update_event_times(leader);
294         list_for_each_entry(event, &leader->sibling_list, group_entry)
295                 update_event_times(event);
296 }
297
298 static struct list_head *
299 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
300 {
301         if (event->attr.pinned)
302                 return &ctx->pinned_groups;
303         else
304                 return &ctx->flexible_groups;
305 }
306
307 /*
308  * Add a event from the lists for its context.
309  * Must be called with ctx->mutex and ctx->lock held.
310  */
311 static void
312 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
313 {
314         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
315         event->attach_state |= PERF_ATTACH_CONTEXT;
316
317         /*
318          * If we're a stand alone event or group leader, we go to the context
319          * list, group events are kept attached to the group so that
320          * perf_group_detach can, at all times, locate all siblings.
321          */
322         if (event->group_leader == event) {
323                 struct list_head *list;
324
325                 if (is_software_event(event))
326                         event->group_flags |= PERF_GROUP_SOFTWARE;
327
328                 list = ctx_group_list(event, ctx);
329                 list_add_tail(&event->group_entry, list);
330         }
331
332         list_add_rcu(&event->event_entry, &ctx->event_list);
333         if (!ctx->nr_events)
334                 perf_pmu_rotate_start(ctx->pmu);
335         ctx->nr_events++;
336         if (event->attr.inherit_stat)
337                 ctx->nr_stat++;
338 }
339
340 /*
341  * Called at perf_event creation and when events are attached/detached from a
342  * group.
343  */
344 static void perf_event__read_size(struct perf_event *event)
345 {
346         int entry = sizeof(u64); /* value */
347         int size = 0;
348         int nr = 1;
349
350         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
351                 size += sizeof(u64);
352
353         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
354                 size += sizeof(u64);
355
356         if (event->attr.read_format & PERF_FORMAT_ID)
357                 entry += sizeof(u64);
358
359         if (event->attr.read_format & PERF_FORMAT_GROUP) {
360                 nr += event->group_leader->nr_siblings;
361                 size += sizeof(u64);
362         }
363
364         size += entry * nr;
365         event->read_size = size;
366 }
367
368 static void perf_event__header_size(struct perf_event *event)
369 {
370         struct perf_sample_data *data;
371         u64 sample_type = event->attr.sample_type;
372         u16 size = 0;
373
374         perf_event__read_size(event);
375
376         if (sample_type & PERF_SAMPLE_IP)
377                 size += sizeof(data->ip);
378
379         if (sample_type & PERF_SAMPLE_ADDR)
380                 size += sizeof(data->addr);
381
382         if (sample_type & PERF_SAMPLE_PERIOD)
383                 size += sizeof(data->period);
384
385         if (sample_type & PERF_SAMPLE_READ)
386                 size += event->read_size;
387
388         event->header_size = size;
389 }
390
391 static void perf_event__id_header_size(struct perf_event *event)
392 {
393         struct perf_sample_data *data;
394         u64 sample_type = event->attr.sample_type;
395         u16 size = 0;
396
397         if (sample_type & PERF_SAMPLE_TID)
398                 size += sizeof(data->tid_entry);
399
400         if (sample_type & PERF_SAMPLE_TIME)
401                 size += sizeof(data->time);
402
403         if (sample_type & PERF_SAMPLE_ID)
404                 size += sizeof(data->id);
405
406         if (sample_type & PERF_SAMPLE_STREAM_ID)
407                 size += sizeof(data->stream_id);
408
409         if (sample_type & PERF_SAMPLE_CPU)
410                 size += sizeof(data->cpu_entry);
411
412         event->id_header_size = size;
413 }
414
415 static void perf_group_attach(struct perf_event *event)
416 {
417         struct perf_event *group_leader = event->group_leader, *pos;
418
419         /*
420          * We can have double attach due to group movement in perf_event_open.
421          */
422         if (event->attach_state & PERF_ATTACH_GROUP)
423                 return;
424
425         event->attach_state |= PERF_ATTACH_GROUP;
426
427         if (group_leader == event)
428                 return;
429
430         if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
431                         !is_software_event(event))
432                 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
433
434         list_add_tail(&event->group_entry, &group_leader->sibling_list);
435         group_leader->nr_siblings++;
436
437         perf_event__header_size(group_leader);
438
439         list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
440                 perf_event__header_size(pos);
441 }
442
443 /*
444  * Remove a event from the lists for its context.
445  * Must be called with ctx->mutex and ctx->lock held.
446  */
447 static void
448 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
449 {
450         /*
451          * We can have double detach due to exit/hot-unplug + close.
452          */
453         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
454                 return;
455
456         event->attach_state &= ~PERF_ATTACH_CONTEXT;
457
458         ctx->nr_events--;
459         if (event->attr.inherit_stat)
460                 ctx->nr_stat--;
461
462         list_del_rcu(&event->event_entry);
463
464         if (event->group_leader == event)
465                 list_del_init(&event->group_entry);
466
467         update_group_times(event);
468
469         /*
470          * If event was in error state, then keep it
471          * that way, otherwise bogus counts will be
472          * returned on read(). The only way to get out
473          * of error state is by explicit re-enabling
474          * of the event
475          */
476         if (event->state > PERF_EVENT_STATE_OFF)
477                 event->state = PERF_EVENT_STATE_OFF;
478 }
479
480 static void perf_group_detach(struct perf_event *event)
481 {
482         struct perf_event *sibling, *tmp;
483         struct list_head *list = NULL;
484
485         /*
486          * We can have double detach due to exit/hot-unplug + close.
487          */
488         if (!(event->attach_state & PERF_ATTACH_GROUP))
489                 return;
490
491         event->attach_state &= ~PERF_ATTACH_GROUP;
492
493         /*
494          * If this is a sibling, remove it from its group.
495          */
496         if (event->group_leader != event) {
497                 list_del_init(&event->group_entry);
498                 event->group_leader->nr_siblings--;
499                 goto out;
500         }
501
502         if (!list_empty(&event->group_entry))
503                 list = &event->group_entry;
504
505         /*
506          * If this was a group event with sibling events then
507          * upgrade the siblings to singleton events by adding them
508          * to whatever list we are on.
509          */
510         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
511                 if (list)
512                         list_move_tail(&sibling->group_entry, list);
513                 sibling->group_leader = sibling;
514
515                 /* Inherit group flags from the previous leader */
516                 sibling->group_flags = event->group_flags;
517         }
518
519 out:
520         perf_event__header_size(event->group_leader);
521
522         list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
523                 perf_event__header_size(tmp);
524 }
525
526 static inline int
527 event_filter_match(struct perf_event *event)
528 {
529         return event->cpu == -1 || event->cpu == smp_processor_id();
530 }
531
532 static void
533 event_sched_out(struct perf_event *event,
534                   struct perf_cpu_context *cpuctx,
535                   struct perf_event_context *ctx)
536 {
537         u64 delta;
538         /*
539          * An event which could not be activated because of
540          * filter mismatch still needs to have its timings
541          * maintained, otherwise bogus information is return
542          * via read() for time_enabled, time_running:
543          */
544         if (event->state == PERF_EVENT_STATE_INACTIVE
545             && !event_filter_match(event)) {
546                 delta = ctx->time - event->tstamp_stopped;
547                 event->tstamp_running += delta;
548                 event->tstamp_stopped = ctx->time;
549         }
550
551         if (event->state != PERF_EVENT_STATE_ACTIVE)
552                 return;
553
554         event->state = PERF_EVENT_STATE_INACTIVE;
555         if (event->pending_disable) {
556                 event->pending_disable = 0;
557                 event->state = PERF_EVENT_STATE_OFF;
558         }
559         event->tstamp_stopped = ctx->time;
560         event->pmu->del(event, 0);
561         event->oncpu = -1;
562
563         if (!is_software_event(event))
564                 cpuctx->active_oncpu--;
565         ctx->nr_active--;
566         if (event->attr.exclusive || !cpuctx->active_oncpu)
567                 cpuctx->exclusive = 0;
568 }
569
570 static void
571 group_sched_out(struct perf_event *group_event,
572                 struct perf_cpu_context *cpuctx,
573                 struct perf_event_context *ctx)
574 {
575         struct perf_event *event;
576         int state = group_event->state;
577
578         event_sched_out(group_event, cpuctx, ctx);
579
580         /*
581          * Schedule out siblings (if any):
582          */
583         list_for_each_entry(event, &group_event->sibling_list, group_entry)
584                 event_sched_out(event, cpuctx, ctx);
585
586         if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
587                 cpuctx->exclusive = 0;
588 }
589
590 static inline struct perf_cpu_context *
591 __get_cpu_context(struct perf_event_context *ctx)
592 {
593         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
594 }
595
596 /*
597  * Cross CPU call to remove a performance event
598  *
599  * We disable the event on the hardware level first. After that we
600  * remove it from the context list.
601  */
602 static void __perf_event_remove_from_context(void *info)
603 {
604         struct perf_event *event = info;
605         struct perf_event_context *ctx = event->ctx;
606         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
607
608         /*
609          * If this is a task context, we need to check whether it is
610          * the current task context of this cpu. If not it has been
611          * scheduled out before the smp call arrived.
612          */
613         if (ctx->task && cpuctx->task_ctx != ctx)
614                 return;
615
616         raw_spin_lock(&ctx->lock);
617
618         event_sched_out(event, cpuctx, ctx);
619
620         list_del_event(event, ctx);
621
622         raw_spin_unlock(&ctx->lock);
623 }
624
625
626 /*
627  * Remove the event from a task's (or a CPU's) list of events.
628  *
629  * Must be called with ctx->mutex held.
630  *
631  * CPU events are removed with a smp call. For task events we only
632  * call when the task is on a CPU.
633  *
634  * If event->ctx is a cloned context, callers must make sure that
635  * every task struct that event->ctx->task could possibly point to
636  * remains valid.  This is OK when called from perf_release since
637  * that only calls us on the top-level context, which can't be a clone.
638  * When called from perf_event_exit_task, it's OK because the
639  * context has been detached from its task.
640  */
641 static void perf_event_remove_from_context(struct perf_event *event)
642 {
643         struct perf_event_context *ctx = event->ctx;
644         struct task_struct *task = ctx->task;
645
646         if (!task) {
647                 /*
648                  * Per cpu events are removed via an smp call and
649                  * the removal is always successful.
650                  */
651                 smp_call_function_single(event->cpu,
652                                          __perf_event_remove_from_context,
653                                          event, 1);
654                 return;
655         }
656
657 retry:
658         task_oncpu_function_call(task, __perf_event_remove_from_context,
659                                  event);
660
661         raw_spin_lock_irq(&ctx->lock);
662         /*
663          * If the context is active we need to retry the smp call.
664          */
665         if (ctx->nr_active && !list_empty(&event->group_entry)) {
666                 raw_spin_unlock_irq(&ctx->lock);
667                 goto retry;
668         }
669
670         /*
671          * The lock prevents that this context is scheduled in so we
672          * can remove the event safely, if the call above did not
673          * succeed.
674          */
675         if (!list_empty(&event->group_entry))
676                 list_del_event(event, ctx);
677         raw_spin_unlock_irq(&ctx->lock);
678 }
679
680 /*
681  * Cross CPU call to disable a performance event
682  */
683 static void __perf_event_disable(void *info)
684 {
685         struct perf_event *event = info;
686         struct perf_event_context *ctx = event->ctx;
687         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
688
689         /*
690          * If this is a per-task event, need to check whether this
691          * event's task is the current task on this cpu.
692          */
693         if (ctx->task && cpuctx->task_ctx != ctx)
694                 return;
695
696         raw_spin_lock(&ctx->lock);
697
698         /*
699          * If the event is on, turn it off.
700          * If it is in error state, leave it in error state.
701          */
702         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
703                 update_context_time(ctx);
704                 update_group_times(event);
705                 if (event == event->group_leader)
706                         group_sched_out(event, cpuctx, ctx);
707                 else
708                         event_sched_out(event, cpuctx, ctx);
709                 event->state = PERF_EVENT_STATE_OFF;
710         }
711
712         raw_spin_unlock(&ctx->lock);
713 }
714
715 /*
716  * Disable a event.
717  *
718  * If event->ctx is a cloned context, callers must make sure that
719  * every task struct that event->ctx->task could possibly point to
720  * remains valid.  This condition is satisifed when called through
721  * perf_event_for_each_child or perf_event_for_each because they
722  * hold the top-level event's child_mutex, so any descendant that
723  * goes to exit will block in sync_child_event.
724  * When called from perf_pending_event it's OK because event->ctx
725  * is the current context on this CPU and preemption is disabled,
726  * hence we can't get into perf_event_task_sched_out for this context.
727  */
728 void perf_event_disable(struct perf_event *event)
729 {
730         struct perf_event_context *ctx = event->ctx;
731         struct task_struct *task = ctx->task;
732
733         if (!task) {
734                 /*
735                  * Disable the event on the cpu that it's on
736                  */
737                 smp_call_function_single(event->cpu, __perf_event_disable,
738                                          event, 1);
739                 return;
740         }
741
742 retry:
743         task_oncpu_function_call(task, __perf_event_disable, event);
744
745         raw_spin_lock_irq(&ctx->lock);
746         /*
747          * If the event is still active, we need to retry the cross-call.
748          */
749         if (event->state == PERF_EVENT_STATE_ACTIVE) {
750                 raw_spin_unlock_irq(&ctx->lock);
751                 goto retry;
752         }
753
754         /*
755          * Since we have the lock this context can't be scheduled
756          * in, so we can change the state safely.
757          */
758         if (event->state == PERF_EVENT_STATE_INACTIVE) {
759                 update_group_times(event);
760                 event->state = PERF_EVENT_STATE_OFF;
761         }
762
763         raw_spin_unlock_irq(&ctx->lock);
764 }
765
766 static int
767 event_sched_in(struct perf_event *event,
768                  struct perf_cpu_context *cpuctx,
769                  struct perf_event_context *ctx)
770 {
771         if (event->state <= PERF_EVENT_STATE_OFF)
772                 return 0;
773
774         event->state = PERF_EVENT_STATE_ACTIVE;
775         event->oncpu = smp_processor_id();
776         /*
777          * The new state must be visible before we turn it on in the hardware:
778          */
779         smp_wmb();
780
781         if (event->pmu->add(event, PERF_EF_START)) {
782                 event->state = PERF_EVENT_STATE_INACTIVE;
783                 event->oncpu = -1;
784                 return -EAGAIN;
785         }
786
787         event->tstamp_running += ctx->time - event->tstamp_stopped;
788
789         event->shadow_ctx_time = ctx->time - ctx->timestamp;
790
791         if (!is_software_event(event))
792                 cpuctx->active_oncpu++;
793         ctx->nr_active++;
794
795         if (event->attr.exclusive)
796                 cpuctx->exclusive = 1;
797
798         return 0;
799 }
800
801 static int
802 group_sched_in(struct perf_event *group_event,
803                struct perf_cpu_context *cpuctx,
804                struct perf_event_context *ctx)
805 {
806         struct perf_event *event, *partial_group = NULL;
807         struct pmu *pmu = group_event->pmu;
808         u64 now = ctx->time;
809         bool simulate = false;
810
811         if (group_event->state == PERF_EVENT_STATE_OFF)
812                 return 0;
813
814         pmu->start_txn(pmu);
815
816         if (event_sched_in(group_event, cpuctx, ctx)) {
817                 pmu->cancel_txn(pmu);
818                 return -EAGAIN;
819         }
820
821         /*
822          * Schedule in siblings as one group (if any):
823          */
824         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
825                 if (event_sched_in(event, cpuctx, ctx)) {
826                         partial_group = event;
827                         goto group_error;
828                 }
829         }
830
831         if (!pmu->commit_txn(pmu))
832                 return 0;
833
834 group_error:
835         /*
836          * Groups can be scheduled in as one unit only, so undo any
837          * partial group before returning:
838          * The events up to the failed event are scheduled out normally,
839          * tstamp_stopped will be updated.
840          *
841          * The failed events and the remaining siblings need to have
842          * their timings updated as if they had gone thru event_sched_in()
843          * and event_sched_out(). This is required to get consistent timings
844          * across the group. This also takes care of the case where the group
845          * could never be scheduled by ensuring tstamp_stopped is set to mark
846          * the time the event was actually stopped, such that time delta
847          * calculation in update_event_times() is correct.
848          */
849         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
850                 if (event == partial_group)
851                         simulate = true;
852
853                 if (simulate) {
854                         event->tstamp_running += now - event->tstamp_stopped;
855                         event->tstamp_stopped = now;
856                 } else {
857                         event_sched_out(event, cpuctx, ctx);
858                 }
859         }
860         event_sched_out(group_event, cpuctx, ctx);
861
862         pmu->cancel_txn(pmu);
863
864         return -EAGAIN;
865 }
866
867 /*
868  * Work out whether we can put this event group on the CPU now.
869  */
870 static int group_can_go_on(struct perf_event *event,
871                            struct perf_cpu_context *cpuctx,
872                            int can_add_hw)
873 {
874         /*
875          * Groups consisting entirely of software events can always go on.
876          */
877         if (event->group_flags & PERF_GROUP_SOFTWARE)
878                 return 1;
879         /*
880          * If an exclusive group is already on, no other hardware
881          * events can go on.
882          */
883         if (cpuctx->exclusive)
884                 return 0;
885         /*
886          * If this group is exclusive and there are already
887          * events on the CPU, it can't go on.
888          */
889         if (event->attr.exclusive && cpuctx->active_oncpu)
890                 return 0;
891         /*
892          * Otherwise, try to add it if all previous groups were able
893          * to go on.
894          */
895         return can_add_hw;
896 }
897
898 static void add_event_to_ctx(struct perf_event *event,
899                                struct perf_event_context *ctx)
900 {
901         list_add_event(event, ctx);
902         perf_group_attach(event);
903         event->tstamp_enabled = ctx->time;
904         event->tstamp_running = ctx->time;
905         event->tstamp_stopped = ctx->time;
906 }
907
908 /*
909  * Cross CPU call to install and enable a performance event
910  *
911  * Must be called with ctx->mutex held
912  */
913 static void __perf_install_in_context(void *info)
914 {
915         struct perf_event *event = info;
916         struct perf_event_context *ctx = event->ctx;
917         struct perf_event *leader = event->group_leader;
918         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
919         int err;
920
921         /*
922          * If this is a task context, we need to check whether it is
923          * the current task context of this cpu. If not it has been
924          * scheduled out before the smp call arrived.
925          * Or possibly this is the right context but it isn't
926          * on this cpu because it had no events.
927          */
928         if (ctx->task && cpuctx->task_ctx != ctx) {
929                 if (cpuctx->task_ctx || ctx->task != current)
930                         return;
931                 cpuctx->task_ctx = ctx;
932         }
933
934         raw_spin_lock(&ctx->lock);
935         ctx->is_active = 1;
936         update_context_time(ctx);
937
938         add_event_to_ctx(event, ctx);
939
940         if (event->cpu != -1 && event->cpu != smp_processor_id())
941                 goto unlock;
942
943         /*
944          * Don't put the event on if it is disabled or if
945          * it is in a group and the group isn't on.
946          */
947         if (event->state != PERF_EVENT_STATE_INACTIVE ||
948             (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
949                 goto unlock;
950
951         /*
952          * An exclusive event can't go on if there are already active
953          * hardware events, and no hardware event can go on if there
954          * is already an exclusive event on.
955          */
956         if (!group_can_go_on(event, cpuctx, 1))
957                 err = -EEXIST;
958         else
959                 err = event_sched_in(event, cpuctx, ctx);
960
961         if (err) {
962                 /*
963                  * This event couldn't go on.  If it is in a group
964                  * then we have to pull the whole group off.
965                  * If the event group is pinned then put it in error state.
966                  */
967                 if (leader != event)
968                         group_sched_out(leader, cpuctx, ctx);
969                 if (leader->attr.pinned) {
970                         update_group_times(leader);
971                         leader->state = PERF_EVENT_STATE_ERROR;
972                 }
973         }
974
975 unlock:
976         raw_spin_unlock(&ctx->lock);
977 }
978
979 /*
980  * Attach a performance event to a context
981  *
982  * First we add the event to the list with the hardware enable bit
983  * in event->hw_config cleared.
984  *
985  * If the event is attached to a task which is on a CPU we use a smp
986  * call to enable it in the task context. The task might have been
987  * scheduled away, but we check this in the smp call again.
988  *
989  * Must be called with ctx->mutex held.
990  */
991 static void
992 perf_install_in_context(struct perf_event_context *ctx,
993                         struct perf_event *event,
994                         int cpu)
995 {
996         struct task_struct *task = ctx->task;
997
998         event->ctx = ctx;
999
1000         if (!task) {
1001                 /*
1002                  * Per cpu events are installed via an smp call and
1003                  * the install is always successful.
1004                  */
1005                 smp_call_function_single(cpu, __perf_install_in_context,
1006                                          event, 1);
1007                 return;
1008         }
1009
1010 retry:
1011         task_oncpu_function_call(task, __perf_install_in_context,
1012                                  event);
1013
1014         raw_spin_lock_irq(&ctx->lock);
1015         /*
1016          * we need to retry the smp call.
1017          */
1018         if (ctx->is_active && list_empty(&event->group_entry)) {
1019                 raw_spin_unlock_irq(&ctx->lock);
1020                 goto retry;
1021         }
1022
1023         /*
1024          * The lock prevents that this context is scheduled in so we
1025          * can add the event safely, if it the call above did not
1026          * succeed.
1027          */
1028         if (list_empty(&event->group_entry))
1029                 add_event_to_ctx(event, ctx);
1030         raw_spin_unlock_irq(&ctx->lock);
1031 }
1032
1033 /*
1034  * Put a event into inactive state and update time fields.
1035  * Enabling the leader of a group effectively enables all
1036  * the group members that aren't explicitly disabled, so we
1037  * have to update their ->tstamp_enabled also.
1038  * Note: this works for group members as well as group leaders
1039  * since the non-leader members' sibling_lists will be empty.
1040  */
1041 static void __perf_event_mark_enabled(struct perf_event *event,
1042                                         struct perf_event_context *ctx)
1043 {
1044         struct perf_event *sub;
1045
1046         event->state = PERF_EVENT_STATE_INACTIVE;
1047         event->tstamp_enabled = ctx->time - event->total_time_enabled;
1048         list_for_each_entry(sub, &event->sibling_list, group_entry) {
1049                 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
1050                         sub->tstamp_enabled =
1051                                 ctx->time - sub->total_time_enabled;
1052                 }
1053         }
1054 }
1055
1056 /*
1057  * Cross CPU call to enable a performance event
1058  */
1059 static void __perf_event_enable(void *info)
1060 {
1061         struct perf_event *event = info;
1062         struct perf_event_context *ctx = event->ctx;
1063         struct perf_event *leader = event->group_leader;
1064         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1065         int err;
1066
1067         /*
1068          * If this is a per-task event, need to check whether this
1069          * event's task is the current task on this cpu.
1070          */
1071         if (ctx->task && cpuctx->task_ctx != ctx) {
1072                 if (cpuctx->task_ctx || ctx->task != current)
1073                         return;
1074                 cpuctx->task_ctx = ctx;
1075         }
1076
1077         raw_spin_lock(&ctx->lock);
1078         ctx->is_active = 1;
1079         update_context_time(ctx);
1080
1081         if (event->state >= PERF_EVENT_STATE_INACTIVE)
1082                 goto unlock;
1083         __perf_event_mark_enabled(event, ctx);
1084
1085         if (event->cpu != -1 && event->cpu != smp_processor_id())
1086                 goto unlock;
1087
1088         /*
1089          * If the event is in a group and isn't the group leader,
1090          * then don't put it on unless the group is on.
1091          */
1092         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
1093                 goto unlock;
1094
1095         if (!group_can_go_on(event, cpuctx, 1)) {
1096                 err = -EEXIST;
1097         } else {
1098                 if (event == leader)
1099                         err = group_sched_in(event, cpuctx, ctx);
1100                 else
1101                         err = event_sched_in(event, cpuctx, ctx);
1102         }
1103
1104         if (err) {
1105                 /*
1106                  * If this event can't go on and it's part of a
1107                  * group, then the whole group has to come off.
1108                  */
1109                 if (leader != event)
1110                         group_sched_out(leader, cpuctx, ctx);
1111                 if (leader->attr.pinned) {
1112                         update_group_times(leader);
1113                         leader->state = PERF_EVENT_STATE_ERROR;
1114                 }
1115         }
1116
1117 unlock:
1118         raw_spin_unlock(&ctx->lock);
1119 }
1120
1121 /*
1122  * Enable a event.
1123  *
1124  * If event->ctx is a cloned context, callers must make sure that
1125  * every task struct that event->ctx->task could possibly point to
1126  * remains valid.  This condition is satisfied when called through
1127  * perf_event_for_each_child or perf_event_for_each as described
1128  * for perf_event_disable.
1129  */
1130 void perf_event_enable(struct perf_event *event)
1131 {
1132         struct perf_event_context *ctx = event->ctx;
1133         struct task_struct *task = ctx->task;
1134
1135         if (!task) {
1136                 /*
1137                  * Enable the event on the cpu that it's on
1138                  */
1139                 smp_call_function_single(event->cpu, __perf_event_enable,
1140                                          event, 1);
1141                 return;
1142         }
1143
1144         raw_spin_lock_irq(&ctx->lock);
1145         if (event->state >= PERF_EVENT_STATE_INACTIVE)
1146                 goto out;
1147
1148         /*
1149          * If the event is in error state, clear that first.
1150          * That way, if we see the event in error state below, we
1151          * know that it has gone back into error state, as distinct
1152          * from the task having been scheduled away before the
1153          * cross-call arrived.
1154          */
1155         if (event->state == PERF_EVENT_STATE_ERROR)
1156                 event->state = PERF_EVENT_STATE_OFF;
1157
1158 retry:
1159         raw_spin_unlock_irq(&ctx->lock);
1160         task_oncpu_function_call(task, __perf_event_enable, event);
1161
1162         raw_spin_lock_irq(&ctx->lock);
1163
1164         /*
1165          * If the context is active and the event is still off,
1166          * we need to retry the cross-call.
1167          */
1168         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
1169                 goto retry;
1170
1171         /*
1172          * Since we have the lock this context can't be scheduled
1173          * in, so we can change the state safely.
1174          */
1175         if (event->state == PERF_EVENT_STATE_OFF)
1176                 __perf_event_mark_enabled(event, ctx);
1177
1178 out:
1179         raw_spin_unlock_irq(&ctx->lock);
1180 }
1181
1182 static int perf_event_refresh(struct perf_event *event, int refresh)
1183 {
1184         /*
1185          * not supported on inherited events
1186          */
1187         if (event->attr.inherit || !is_sampling_event(event))
1188                 return -EINVAL;
1189
1190         atomic_add(refresh, &event->event_limit);
1191         perf_event_enable(event);
1192
1193         return 0;
1194 }
1195
/*
 * Bitmask telling the context scheduling helpers which class of event
 * groups to operate on: flexible, pinned, or both.
 */
enum event_type_t {
	EVENT_FLEXIBLE	= 1 << 0,
	EVENT_PINNED	= 1 << 1,
	EVENT_ALL	= EVENT_FLEXIBLE | EVENT_PINNED,
};
1201
1202 static void ctx_sched_out(struct perf_event_context *ctx,
1203                           struct perf_cpu_context *cpuctx,
1204                           enum event_type_t event_type)
1205 {
1206         struct perf_event *event;
1207
1208         raw_spin_lock(&ctx->lock);
1209         perf_pmu_disable(ctx->pmu);
1210         ctx->is_active = 0;
1211         if (likely(!ctx->nr_events))
1212                 goto out;
1213         update_context_time(ctx);
1214
1215         if (!ctx->nr_active)
1216                 goto out;
1217
1218         if (event_type & EVENT_PINNED) {
1219                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1220                         group_sched_out(event, cpuctx, ctx);
1221         }
1222
1223         if (event_type & EVENT_FLEXIBLE) {
1224                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1225                         group_sched_out(event, cpuctx, ctx);
1226         }
1227 out:
1228         perf_pmu_enable(ctx->pmu);
1229         raw_spin_unlock(&ctx->lock);
1230 }
1231
1232 /*
1233  * Test whether two contexts are equivalent, i.e. whether they
1234  * have both been cloned from the same version of the same context
1235  * and they both have the same number of enabled events.
1236  * If the number of enabled events is the same, then the set
1237  * of enabled events should be the same, because these are both
1238  * inherited contexts, therefore we can't access individual events
1239  * in them directly with an fd; we can only enable/disable all
1240  * events via prctl, or enable/disable all events in a family
1241  * via ioctl, which will have the same effect on both contexts.
1242  */
1243 static int context_equiv(struct perf_event_context *ctx1,
1244                          struct perf_event_context *ctx2)
1245 {
1246         return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1247                 && ctx1->parent_gen == ctx2->parent_gen
1248                 && !ctx1->pin_count && !ctx2->pin_count;
1249 }
1250
1251 static void __perf_event_sync_stat(struct perf_event *event,
1252                                      struct perf_event *next_event)
1253 {
1254         u64 value;
1255
1256         if (!event->attr.inherit_stat)
1257                 return;
1258
1259         /*
1260          * Update the event value, we cannot use perf_event_read()
1261          * because we're in the middle of a context switch and have IRQs
1262          * disabled, which upsets smp_call_function_single(), however
1263          * we know the event must be on the current CPU, therefore we
1264          * don't need to use it.
1265          */
1266         switch (event->state) {
1267         case PERF_EVENT_STATE_ACTIVE:
1268                 event->pmu->read(event);
1269                 /* fall-through */
1270
1271         case PERF_EVENT_STATE_INACTIVE:
1272                 update_event_times(event);
1273                 break;
1274
1275         default:
1276                 break;
1277         }
1278
1279         /*
1280          * In order to keep per-task stats reliable we need to flip the event
1281          * values when we flip the contexts.
1282          */
1283         value = local64_read(&next_event->count);
1284         value = local64_xchg(&event->count, value);
1285         local64_set(&next_event->count, value);
1286
1287         swap(event->total_time_enabled, next_event->total_time_enabled);
1288         swap(event->total_time_running, next_event->total_time_running);
1289
1290         /*
1291          * Since we swizzled the values, update the user visible data too.
1292          */
1293         perf_event_update_userpage(event);
1294         perf_event_update_userpage(next_event);
1295 }
1296
/*
 * list_next_entry - get the entry following @pos in its list
 * @pos:	pointer to the current entry
 * @member:	name of the struct list_head member inside the entry
 *
 * @pos is parenthesized so that an expression argument (a cast,
 * pointer arithmetic, a conditional, ...) expands correctly.  No check
 * for the list head is made; callers compare against the head
 * themselves.
 */
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)
1299
1300 static void perf_event_sync_stat(struct perf_event_context *ctx,
1301                                    struct perf_event_context *next_ctx)
1302 {
1303         struct perf_event *event, *next_event;
1304
1305         if (!ctx->nr_stat)
1306                 return;
1307
1308         update_context_time(ctx);
1309
1310         event = list_first_entry(&ctx->event_list,
1311                                    struct perf_event, event_entry);
1312
1313         next_event = list_first_entry(&next_ctx->event_list,
1314                                         struct perf_event, event_entry);
1315
1316         while (&event->event_entry != &ctx->event_list &&
1317                &next_event->event_entry != &next_ctx->event_list) {
1318
1319                 __perf_event_sync_stat(event, next_event);
1320
1321                 event = list_next_entry(event, event_entry);
1322                 next_event = list_next_entry(next_event, event_entry);
1323         }
1324 }
1325
1326 void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1327                                   struct task_struct *next)
1328 {
1329         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1330         struct perf_event_context *next_ctx;
1331         struct perf_event_context *parent;
1332         struct perf_cpu_context *cpuctx;
1333         int do_switch = 1;
1334
1335         if (likely(!ctx))
1336                 return;
1337
1338         cpuctx = __get_cpu_context(ctx);
1339         if (!cpuctx->task_ctx)
1340                 return;
1341
1342         rcu_read_lock();
1343         parent = rcu_dereference(ctx->parent_ctx);
1344         next_ctx = next->perf_event_ctxp[ctxn];
1345         if (parent && next_ctx &&
1346             rcu_dereference(next_ctx->parent_ctx) == parent) {
1347                 /*
1348                  * Looks like the two contexts are clones, so we might be
1349                  * able to optimize the context switch.  We lock both
1350                  * contexts and check that they are clones under the
1351                  * lock (including re-checking that neither has been
1352                  * uncloned in the meantime).  It doesn't matter which
1353                  * order we take the locks because no other cpu could
1354                  * be trying to lock both of these tasks.
1355                  */
1356                 raw_spin_lock(&ctx->lock);
1357                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1358                 if (context_equiv(ctx, next_ctx)) {
1359                         /*
1360                          * XXX do we need a memory barrier of sorts
1361                          * wrt to rcu_dereference() of perf_event_ctxp
1362                          */
1363                         task->perf_event_ctxp[ctxn] = next_ctx;
1364                         next->perf_event_ctxp[ctxn] = ctx;
1365                         ctx->task = next;
1366                         next_ctx->task = task;
1367                         do_switch = 0;
1368
1369                         perf_event_sync_stat(ctx, next_ctx);
1370                 }
1371                 raw_spin_unlock(&next_ctx->lock);
1372                 raw_spin_unlock(&ctx->lock);
1373         }
1374         rcu_read_unlock();
1375
1376         if (do_switch) {
1377                 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1378                 cpuctx->task_ctx = NULL;
1379         }
1380 }
1381
/*
 * Iterate @ctxn over every per-task context index, from 0 up to (but
 * not including) perf_nr_task_contexts.
 */
#define for_each_task_context_nr(ctxn)					\
	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; ++(ctxn))
1384
1385 /*
1386  * Called from scheduler to remove the events of the current task,
1387  * with interrupts disabled.
1388  *
1389  * We stop each event and update the event value in event->count.
1390  *
1391  * This does not protect us against NMI, but disable()
1392  * sets the disabled bit in the control field of event _before_
1393  * accessing the event control register. If a NMI hits, then it will
1394  * not restart the event.
1395  */
1396 void __perf_event_task_sched_out(struct task_struct *task,
1397                                  struct task_struct *next)
1398 {
1399         int ctxn;
1400
1401         for_each_task_context_nr(ctxn)
1402                 perf_event_context_sched_out(task, ctxn, next);
1403 }
1404
1405 static void task_ctx_sched_out(struct perf_event_context *ctx,
1406                                enum event_type_t event_type)
1407 {
1408         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1409
1410         if (!cpuctx->task_ctx)
1411                 return;
1412
1413         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1414                 return;
1415
1416         ctx_sched_out(ctx, cpuctx, event_type);
1417         cpuctx->task_ctx = NULL;
1418 }
1419
1420 /*
1421  * Called with IRQs disabled
1422  */
1423 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1424                               enum event_type_t event_type)
1425 {
1426         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1427 }
1428
1429 static void
1430 ctx_pinned_sched_in(struct perf_event_context *ctx,
1431                     struct perf_cpu_context *cpuctx)
1432 {
1433         struct perf_event *event;
1434
1435         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1436                 if (event->state <= PERF_EVENT_STATE_OFF)
1437                         continue;
1438                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1439                         continue;
1440
1441                 if (group_can_go_on(event, cpuctx, 1))
1442                         group_sched_in(event, cpuctx, ctx);
1443
1444                 /*
1445                  * If this pinned group hasn't been scheduled,
1446                  * put it in error state.
1447                  */
1448                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1449                         update_group_times(event);
1450                         event->state = PERF_EVENT_STATE_ERROR;
1451                 }
1452         }
1453 }
1454
1455 static void
1456 ctx_flexible_sched_in(struct perf_event_context *ctx,
1457                       struct perf_cpu_context *cpuctx)
1458 {
1459         struct perf_event *event;
1460         int can_add_hw = 1;
1461
1462         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1463                 /* Ignore events in OFF or ERROR state */
1464                 if (event->state <= PERF_EVENT_STATE_OFF)
1465                         continue;
1466                 /*
1467                  * Listen to the 'cpu' scheduling filter constraint
1468                  * of events:
1469                  */
1470                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1471                         continue;
1472
1473                 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1474                         if (group_sched_in(event, cpuctx, ctx))
1475                                 can_add_hw = 0;
1476                 }
1477         }
1478 }
1479
1480 static void
1481 ctx_sched_in(struct perf_event_context *ctx,
1482              struct perf_cpu_context *cpuctx,
1483              enum event_type_t event_type)
1484 {
1485         raw_spin_lock(&ctx->lock);
1486         ctx->is_active = 1;
1487         if (likely(!ctx->nr_events))
1488                 goto out;
1489
1490         ctx->timestamp = perf_clock();
1491
1492         /*
1493          * First go through the list and put on any pinned groups
1494          * in order to give them the best chance of going on.
1495          */
1496         if (event_type & EVENT_PINNED)
1497                 ctx_pinned_sched_in(ctx, cpuctx);
1498
1499         /* Then walk through the lower prio flexible groups */
1500         if (event_type & EVENT_FLEXIBLE)
1501                 ctx_flexible_sched_in(ctx, cpuctx);
1502
1503 out:
1504         raw_spin_unlock(&ctx->lock);
1505 }
1506
1507 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1508                              enum event_type_t event_type)
1509 {
1510         struct perf_event_context *ctx = &cpuctx->ctx;
1511
1512         ctx_sched_in(ctx, cpuctx, event_type);
1513 }
1514
1515 static void task_ctx_sched_in(struct perf_event_context *ctx,
1516                               enum event_type_t event_type)
1517 {
1518         struct perf_cpu_context *cpuctx;
1519
1520         cpuctx = __get_cpu_context(ctx);
1521         if (cpuctx->task_ctx == ctx)
1522                 return;
1523
1524         ctx_sched_in(ctx, cpuctx, event_type);
1525         cpuctx->task_ctx = ctx;
1526 }
1527
1528 void perf_event_context_sched_in(struct perf_event_context *ctx)
1529 {
1530         struct perf_cpu_context *cpuctx;
1531
1532         cpuctx = __get_cpu_context(ctx);
1533         if (cpuctx->task_ctx == ctx)
1534                 return;
1535
1536         perf_pmu_disable(ctx->pmu);
1537         /*
1538          * We want to keep the following priority order:
1539          * cpu pinned (that don't need to move), task pinned,
1540          * cpu flexible, task flexible.
1541          */
1542         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1543
1544         ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1545         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1546         ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1547
1548         cpuctx->task_ctx = ctx;
1549
1550         /*
1551          * Since these rotations are per-cpu, we need to ensure the
1552          * cpu-context we got scheduled on is actually rotating.
1553          */
1554         perf_pmu_rotate_start(ctx->pmu);
1555         perf_pmu_enable(ctx->pmu);
1556 }
1557
1558 /*
1559  * Called from scheduler to add the events of the current task
1560  * with interrupts disabled.
1561  *
1562  * We restore the event value and then enable it.
1563  *
1564  * This does not protect us against NMI, but enable()
1565  * sets the enabled bit in the control field of event _before_
1566  * accessing the event control register. If a NMI hits, then it will
1567  * keep the event running.
1568  */
1569 void __perf_event_task_sched_in(struct task_struct *task)
1570 {
1571         struct perf_event_context *ctx;
1572         int ctxn;
1573
1574         for_each_task_context_nr(ctxn) {
1575                 ctx = task->perf_event_ctxp[ctxn];
1576                 if (likely(!ctx))
1577                         continue;
1578
1579                 perf_event_context_sched_in(ctx);
1580         }
1581 }
1582
/*
 * Sentinel value of hw_perf_event::interrupts; the tick handler
 * restarts (unthrottles) events it finds carrying this value.
 */
#define MAX_INTERRUPTS (~0ULL)

/* Defined further down; needed by the frequency adjustment code. */
static void perf_log_throttle(struct perf_event *event, int enable);
1586
1587 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1588 {
1589         u64 frequency = event->attr.sample_freq;
1590         u64 sec = NSEC_PER_SEC;
1591         u64 divisor, dividend;
1592
1593         int count_fls, nsec_fls, frequency_fls, sec_fls;
1594
1595         count_fls = fls64(count);
1596         nsec_fls = fls64(nsec);
1597         frequency_fls = fls64(frequency);
1598         sec_fls = 30;
1599
1600         /*
1601          * We got @count in @nsec, with a target of sample_freq HZ
1602          * the target period becomes:
1603          *
1604          *             @count * 10^9
1605          * period = -------------------
1606          *          @nsec * sample_freq
1607          *
1608          */
1609
1610         /*
1611          * Reduce accuracy by one bit such that @a and @b converge
1612          * to a similar magnitude.
1613          */
1614 #define REDUCE_FLS(a, b)                \
1615 do {                                    \
1616         if (a##_fls > b##_fls) {        \
1617                 a >>= 1;                \
1618                 a##_fls--;              \
1619         } else {                        \
1620                 b >>= 1;                \
1621                 b##_fls--;              \
1622         }                               \
1623 } while (0)
1624
1625         /*
1626          * Reduce accuracy until either term fits in a u64, then proceed with
1627          * the other, so that finally we can do a u64/u64 division.
1628          */
1629         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1630                 REDUCE_FLS(nsec, frequency);
1631                 REDUCE_FLS(sec, count);
1632         }
1633
1634         if (count_fls + sec_fls > 64) {
1635                 divisor = nsec * frequency;
1636
1637                 while (count_fls + sec_fls > 64) {
1638                         REDUCE_FLS(count, sec);
1639                         divisor >>= 1;
1640                 }
1641
1642                 dividend = count * sec;
1643         } else {
1644                 dividend = count * sec;
1645
1646                 while (nsec_fls + frequency_fls > 64) {
1647                         REDUCE_FLS(nsec, frequency);
1648                         dividend >>= 1;
1649                 }
1650
1651                 divisor = nsec * frequency;
1652         }
1653
1654         if (!divisor)
1655                 return dividend;
1656
1657         return div64_u64(dividend, divisor);
1658 }
1659
1660 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1661 {
1662         struct hw_perf_event *hwc = &event->hw;
1663         s64 period, sample_period;
1664         s64 delta;
1665
1666         period = perf_calculate_period(event, nsec, count);
1667
1668         delta = (s64)(period - hwc->sample_period);
1669         delta = (delta + 7) / 8; /* low pass filter */
1670
1671         sample_period = hwc->sample_period + delta;
1672
1673         if (!sample_period)
1674                 sample_period = 1;
1675
1676         hwc->sample_period = sample_period;
1677
1678         if (local64_read(&hwc->period_left) > 8*sample_period) {
1679                 event->pmu->stop(event, PERF_EF_UPDATE);
1680                 local64_set(&hwc->period_left, 0);
1681                 event->pmu->start(event, PERF_EF_RELOAD);
1682         }
1683 }
1684
1685 static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1686 {
1687         struct perf_event *event;
1688         struct hw_perf_event *hwc;
1689         u64 interrupts, now;
1690         s64 delta;
1691
1692         raw_spin_lock(&ctx->lock);
1693         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1694                 if (event->state != PERF_EVENT_STATE_ACTIVE)
1695                         continue;
1696
1697                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1698                         continue;
1699
1700                 hwc = &event->hw;
1701
1702                 interrupts = hwc->interrupts;
1703                 hwc->interrupts = 0;
1704
1705                 /*
1706                  * unthrottle events on the tick
1707                  */
1708                 if (interrupts == MAX_INTERRUPTS) {
1709                         perf_log_throttle(event, 1);
1710                         event->pmu->start(event, 0);
1711                 }
1712
1713                 if (!event->attr.freq || !event->attr.sample_freq)
1714                         continue;
1715
1716                 event->pmu->read(event);
1717                 now = local64_read(&event->count);
1718                 delta = now - hwc->freq_count_stamp;
1719                 hwc->freq_count_stamp = now;
1720
1721                 if (delta > 0)
1722                         perf_adjust_period(event, period, delta);
1723         }
1724         raw_spin_unlock(&ctx->lock);
1725 }
1726
1727 /*
1728  * Round-robin a context's events:
1729  */
1730 static void rotate_ctx(struct perf_event_context *ctx)
1731 {
1732         raw_spin_lock(&ctx->lock);
1733
1734         /*
1735          * Rotate the first entry last of non-pinned groups. Rotation might be
1736          * disabled by the inheritance code.
1737          */
1738         if (!ctx->rotate_disable)
1739                 list_rotate_left(&ctx->flexible_groups);
1740
1741         raw_spin_unlock(&ctx->lock);
1742 }
1743
1744 /*
1745  * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
1746  * because they're strictly cpu affine and rotate_start is called with IRQs
1747  * disabled, while rotate_context is called from IRQ context.
1748  */
1749 static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1750 {
1751         u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
1752         struct perf_event_context *ctx = NULL;
1753         int rotate = 0, remove = 1;
1754
1755         if (cpuctx->ctx.nr_events) {
1756                 remove = 0;
1757                 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1758                         rotate = 1;
1759         }
1760
1761         ctx = cpuctx->task_ctx;
1762         if (ctx && ctx->nr_events) {
1763                 remove = 0;
1764                 if (ctx->nr_events != ctx->nr_active)
1765                         rotate = 1;
1766         }
1767
1768         perf_pmu_disable(cpuctx->ctx.pmu);
1769         perf_ctx_adjust_freq(&cpuctx->ctx, interval);
1770         if (ctx)
1771                 perf_ctx_adjust_freq(ctx, interval);
1772
1773         if (!rotate)
1774                 goto done;
1775
1776         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1777         if (ctx)
1778                 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1779
1780         rotate_ctx(&cpuctx->ctx);
1781         if (ctx)
1782                 rotate_ctx(ctx);
1783
1784         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1785         if (ctx)
1786                 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1787
1788 done:
1789         if (remove)
1790                 list_del_init(&cpuctx->rotation_list);
1791
1792         perf_pmu_enable(cpuctx->ctx.pmu);
1793 }
1794
1795 void perf_event_task_tick(void)
1796 {
1797         struct list_head *head = &__get_cpu_var(rotation_list);
1798         struct perf_cpu_context *cpuctx, *tmp;
1799
1800         WARN_ON(!irqs_disabled());
1801
1802         list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
1803                 if (cpuctx->jiffies_interval == 1 ||
1804                                 !(jiffies % cpuctx->jiffies_interval))
1805                         perf_rotate_context(cpuctx);
1806         }
1807 }
1808
1809 static int event_enable_on_exec(struct perf_event *event,
1810                                 struct perf_event_context *ctx)
1811 {
1812         if (!event->attr.enable_on_exec)
1813                 return 0;
1814
1815         event->attr.enable_on_exec = 0;
1816         if (event->state >= PERF_EVENT_STATE_INACTIVE)
1817                 return 0;
1818
1819         __perf_event_mark_enabled(event, ctx);
1820
1821         return 1;
1822 }
1823
1824 /*
1825  * Enable all of a task's events that have been marked enable-on-exec.
1826  * This expects task == current.
1827  */
1828 static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1829 {
1830         struct perf_event *event;
1831         unsigned long flags;
1832         int enabled = 0;
1833         int ret;
1834
1835         local_irq_save(flags);
1836         if (!ctx || !ctx->nr_events)
1837                 goto out;
1838
1839         task_ctx_sched_out(ctx, EVENT_ALL);
1840
1841         raw_spin_lock(&ctx->lock);
1842
1843         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1844                 ret = event_enable_on_exec(event, ctx);
1845                 if (ret)
1846                         enabled = 1;
1847         }
1848
1849         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1850                 ret = event_enable_on_exec(event, ctx);
1851                 if (ret)
1852                         enabled = 1;
1853         }
1854
1855         /*
1856          * Unclone this context if we enabled any event.
1857          */
1858         if (enabled)
1859                 unclone_ctx(ctx);
1860
1861         raw_spin_unlock(&ctx->lock);
1862
1863         perf_event_context_sched_in(ctx);
1864 out:
1865         local_irq_restore(flags);
1866 }
1867
1868 /*
1869  * Cross CPU call to read the hardware event
1870  */
1871 static void __perf_event_read(void *info)
1872 {
1873         struct perf_event *event = info;
1874         struct perf_event_context *ctx = event->ctx;
1875         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1876
1877         /*
1878          * If this is a task context, we need to check whether it is
1879          * the current task context of this cpu.  If not it has been
1880          * scheduled out before the smp call arrived.  In that case
1881          * event->count would have been updated to a recent sample
1882          * when the event was scheduled out.
1883          */
1884         if (ctx->task && cpuctx->task_ctx != ctx)
1885                 return;
1886
1887         raw_spin_lock(&ctx->lock);
1888         update_context_time(ctx);
1889         update_event_times(event);
1890         raw_spin_unlock(&ctx->lock);
1891
1892         event->pmu->read(event);
1893 }
1894
1895 static inline u64 perf_event_count(struct perf_event *event)
1896 {
1897         return local64_read(&event->count) + atomic64_read(&event->child_count);
1898 }
1899
1900 static u64 perf_event_read(struct perf_event *event)
1901 {
1902         /*
1903          * If event is enabled and currently active on a CPU, update the
1904          * value in the event structure:
1905          */
1906         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1907                 smp_call_function_single(event->oncpu,
1908                                          __perf_event_read, event, 1);
1909         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1910                 struct perf_event_context *ctx = event->ctx;
1911                 unsigned long flags;
1912
1913                 raw_spin_lock_irqsave(&ctx->lock, flags);
1914                 /*
1915                  * may read while context is not active
1916                  * (e.g., thread is blocked), in that case
1917                  * we cannot update context time
1918                  */
1919                 if (ctx->is_active)
1920                         update_context_time(ctx);
1921                 update_event_times(event);
1922                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1923         }
1924
1925         return perf_event_count(event);
1926 }
1927
1928 /*
1929  * Callchain support
1930  */
1931
1932 struct callchain_cpus_entries {
1933         struct rcu_head                 rcu_head;
1934         struct perf_callchain_entry     *cpu_entries[0];
1935 };
1936
1937 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
1938 static atomic_t nr_callchain_events;
1939 static DEFINE_MUTEX(callchain_mutex);
1940 struct callchain_cpus_entries *callchain_cpus_entries;
1941
1942
1943 __weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
1944                                   struct pt_regs *regs)
1945 {
1946 }
1947
1948 __weak void perf_callchain_user(struct perf_callchain_entry *entry,
1949                                 struct pt_regs *regs)
1950 {
1951 }
1952
1953 static void release_callchain_buffers_rcu(struct rcu_head *head)
1954 {
1955         struct callchain_cpus_entries *entries;
1956         int cpu;
1957
1958         entries = container_of(head, struct callchain_cpus_entries, rcu_head);
1959
1960         for_each_possible_cpu(cpu)
1961                 kfree(entries->cpu_entries[cpu]);
1962
1963         kfree(entries);
1964 }
1965
1966 static void release_callchain_buffers(void)
1967 {
1968         struct callchain_cpus_entries *entries;
1969
1970         entries = callchain_cpus_entries;
1971         rcu_assign_pointer(callchain_cpus_entries, NULL);
1972         call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
1973 }
1974
1975 static int alloc_callchain_buffers(void)
1976 {
1977         int cpu;
1978         int size;
1979         struct callchain_cpus_entries *entries;
1980
1981         /*
1982          * We can't use the percpu allocation API for data that can be
1983          * accessed from NMI. Use a temporary manual per cpu allocation
1984          * until that gets sorted out.
1985          */
1986         size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
1987                 num_possible_cpus();
1988
1989         entries = kzalloc(size, GFP_KERNEL);
1990         if (!entries)
1991                 return -ENOMEM;
1992
1993         size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
1994
1995         for_each_possible_cpu(cpu) {
1996                 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
1997                                                          cpu_to_node(cpu));
1998                 if (!entries->cpu_entries[cpu])
1999                         goto fail;
2000         }
2001
2002         rcu_assign_pointer(callchain_cpus_entries, entries);
2003
2004         return 0;
2005
2006 fail:
2007         for_each_possible_cpu(cpu)
2008                 kfree(entries->cpu_entries[cpu]);
2009         kfree(entries);
2010
2011         return -ENOMEM;
2012 }
2013
2014 static int get_callchain_buffers(void)
2015 {
2016         int err = 0;
2017         int count;
2018
2019         mutex_lock(&callchain_mutex);
2020
2021         count = atomic_inc_return(&nr_callchain_events);
2022         if (WARN_ON_ONCE(count < 1)) {
2023                 err = -EINVAL;
2024                 goto exit;
2025         }
2026
2027         if (count > 1) {
2028                 /* If the allocation failed, give up */
2029                 if (!callchain_cpus_entries)
2030                         err = -ENOMEM;
2031                 goto exit;
2032         }
2033
2034         err = alloc_callchain_buffers();
2035         if (err)
2036                 release_callchain_buffers();
2037 exit:
2038         mutex_unlock(&callchain_mutex);
2039
2040         return err;
2041 }
2042
2043 static void put_callchain_buffers(void)
2044 {
2045         if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2046                 release_callchain_buffers();
2047                 mutex_unlock(&callchain_mutex);
2048         }
2049 }
2050
/*
 * Claim the recursion slot matching the current execution context:
 * 3 for NMI, 2 for hardirq, 1 for softirq, 0 otherwise.
 *
 * Returns the claimed slot index, or -1 if that slot is already taken.
 */
static int get_recursion_context(int *recursion)
{
        int rctx = 0;
        int *slot;

        if (in_nmi())
                rctx = 3;
        else if (in_irq())
                rctx = 2;
        else if (in_softirq())
                rctx = 1;

        slot = &recursion[rctx];
        if (*slot)
                return -1;

        (*slot)++;
        /* Compiler barrier right after the slot is marked as taken. */
        barrier();

        return rctx;
}
2072
/*
 * Give back the recursion slot @rctx obtained from
 * get_recursion_context().
 */
static inline void put_recursion_context(int *recursion, int rctx)
{
        /* Compiler barrier right before the slot is released. */
        barrier();
        recursion[rctx] -= 1;
}
2078
2079 static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2080 {
2081         int cpu;
2082         struct callchain_cpus_entries *entries;
2083
2084         *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2085         if (*rctx == -1)
2086                 return NULL;
2087
2088         entries = rcu_dereference(callchain_cpus_entries);
2089         if (!entries)
2090                 return NULL;
2091
2092         cpu = smp_processor_id();
2093
2094         return &entries->cpu_entries[cpu][*rctx];
2095 }
2096
2097 static void
2098 put_callchain_entry(int rctx)
2099 {
2100         put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2101 }
2102
2103 static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2104 {
2105         int rctx;
2106         struct perf_callchain_entry *entry;
2107
2108
2109         entry = get_callchain_entry(&rctx);
2110         if (rctx == -1)
2111                 return NULL;
2112
2113         if (!entry)
2114                 goto exit_put;
2115
2116         entry->nr = 0;
2117
2118         if (!user_mode(regs)) {
2119                 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2120                 perf_callchain_kernel(entry, regs);
2121                 if (current->mm)
2122                         regs = task_pt_regs(current);
2123                 else
2124                         regs = NULL;
2125         }
2126
2127         if (regs) {
2128                 perf_callchain_store(entry, PERF_CONTEXT_USER);
2129                 perf_callchain_user(entry, regs);
2130         }
2131
2132 exit_put:
2133         put_callchain_entry(rctx);
2134
2135         return entry;
2136 }
2137
2138 /*
2139  * Initialize the perf_event context in a task_struct:
2140  */
2141 static void __perf_event_init_context(struct perf_event_context *ctx)
2142 {
2143         raw_spin_lock_init(&ctx->lock);
2144         mutex_init(&ctx->mutex);
2145         INIT_LIST_HEAD(&ctx->pinned_groups);
2146         INIT_LIST_HEAD(&ctx->flexible_groups);
2147         INIT_LIST_HEAD(&ctx->event_list);
2148         atomic_set(&ctx->refcount, 1);
2149 }
2150
2151 static struct perf_event_context *
2152 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
2153 {
2154         struct perf_event_context *ctx;
2155
2156         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
2157         if (!ctx)
2158                 return NULL;
2159
2160         __perf_event_init_context(ctx);
2161         if (task) {
2162                 ctx->task = task;
2163                 get_task_struct(task);
2164         }
2165         ctx->pmu = pmu;
2166
2167         return ctx;
2168 }
2169
2170 static struct task_struct *
2171 find_lively_task_by_vpid(pid_t vpid)
2172 {
2173         struct task_struct *task;
2174         int err;
2175
2176         rcu_read_lock();
2177         if (!vpid)
2178                 task = current;
2179         else
2180                 task = find_task_by_vpid(vpid);
2181         if (task)
2182                 get_task_struct(task);
2183         rcu_read_unlock();
2184
2185         if (!task)
2186                 return ERR_PTR(-ESRCH);
2187
2188         /*
2189          * Can't attach events to a dying task.
2190          */
2191         err = -ESRCH;
2192         if (task->flags & PF_EXITING)
2193                 goto errout;
2194
2195         /* Reuse ptrace permission checks for now. */
2196         err = -EACCES;
2197         if (!ptrace_may_access(task, PTRACE_MODE_READ))
2198                 goto errout;
2199
2200         return task;
2201 errout:
2202         put_task_struct(task);
2203         return ERR_PTR(err);
2204
2205 }
2206
2207 static struct perf_event_context *
2208 find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2209 {
2210         struct perf_event_context *ctx;
2211         struct perf_cpu_context *cpuctx;
2212         unsigned long flags;
2213         int ctxn, err;
2214
2215         if (!task && cpu != -1) {
2216                 /* Must be root to operate on a CPU event: */
2217                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2218                         return ERR_PTR(-EACCES);
2219
2220                 if (cpu < 0 || cpu >= nr_cpumask_bits)
2221                         return ERR_PTR(-EINVAL);
2222
2223                 /*
2224                  * We could be clever and allow to attach a event to an
2225                  * offline CPU and activate it when the CPU comes up, but
2226                  * that's for later.
2227                  */
2228                 if (!cpu_online(cpu))
2229                         return ERR_PTR(-ENODEV);
2230
2231                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2232                 ctx = &cpuctx->ctx;
2233                 get_ctx(ctx);
2234
2235                 return ctx;
2236         }
2237
2238         err = -EINVAL;
2239         ctxn = pmu->task_ctx_nr;
2240         if (ctxn < 0)
2241                 goto errout;
2242
2243 retry:
2244         ctx = perf_lock_task_context(task, ctxn, &flags);
2245         if (ctx) {
2246                 unclone_ctx(ctx);
2247                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2248         }
2249
2250         if (!ctx) {
2251                 ctx = alloc_perf_context(pmu, task);
2252                 err = -ENOMEM;
2253                 if (!ctx)
2254                         goto errout;
2255
2256                 get_ctx(ctx);
2257
2258                 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
2259                         /*
2260                          * We raced with some other task; use
2261                          * the context they set.
2262                          */
2263                         put_task_struct(task);
2264                         kfree(ctx);
2265                         goto retry;
2266                 }
2267         }
2268
2269         return ctx;
2270
2271 errout:
2272         return ERR_PTR(err);
2273 }
2274
2275 static void perf_event_free_filter(struct perf_event *event);
2276
2277 static void free_event_rcu(struct rcu_head *head)
2278 {
2279         struct perf_event *event;
2280
2281         event = container_of(head, struct perf_event, rcu_head);
2282         if (event->ns)
2283                 put_pid_ns(event->ns);
2284         perf_event_free_filter(event);
2285         kfree(event);
2286 }
2287
2288 static void perf_buffer_put(struct perf_buffer *buffer);
2289
2290 static void free_event(struct perf_event *event)
2291 {
2292         irq_work_sync(&event->pending);
2293
2294         if (!event->parent) {
2295                 if (event->attach_state & PERF_ATTACH_TASK)
2296                         jump_label_dec(&perf_task_events);
2297                 if (event->attr.mmap || event->attr.mmap_data)
2298                         atomic_dec(&nr_mmap_events);
2299                 if (event->attr.comm)
2300                         atomic_dec(&nr_comm_events);
2301                 if (event->attr.task)
2302                         atomic_dec(&nr_task_events);
2303                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2304                         put_callchain_buffers();
2305         }
2306
2307         if (event->buffer) {
2308                 perf_buffer_put(event->buffer);
2309                 event->buffer = NULL;
2310         }
2311
2312         if (event->destroy)
2313                 event->destroy(event);
2314
2315         if (event->ctx)
2316                 put_ctx(event->ctx);
2317
2318         call_rcu(&event->rcu_head, free_event_rcu);
2319 }
2320
2321 int perf_event_release_kernel(struct perf_event *event)
2322 {
2323         struct perf_event_context *ctx = event->ctx;
2324
2325         /*
2326          * Remove from the PMU, can't get re-enabled since we got
2327          * here because the last ref went.
2328          */
2329         perf_event_disable(event);
2330
2331         WARN_ON_ONCE(ctx->parent_ctx);
2332         /*
2333          * There are two ways this annotation is useful:
2334          *
2335          *  1) there is a lock recursion from perf_event_exit_task
2336          *     see the comment there.
2337          *
2338          *  2) there is a lock-inversion with mmap_sem through
2339          *     perf_event_read_group(), which takes faults while
2340          *     holding ctx->mutex, however this is called after
2341          *     the last filedesc died, so there is no possibility
2342          *     to trigger the AB-BA case.
2343          */
2344         mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2345         raw_spin_lock_irq(&ctx->lock);
2346         perf_group_detach(event);
2347         list_del_event(event, ctx);
2348         raw_spin_unlock_irq(&ctx->lock);
2349         mutex_unlock(&ctx->mutex);
2350
2351         free_event(event);
2352
2353         return 0;
2354 }
2355 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2356
2357 /*
2358  * Called when the last reference to the file is gone.
2359  */
2360 static int perf_release(struct inode *inode, struct file *file)
2361 {
2362         struct perf_event *event = file->private_data;
2363         struct task_struct *owner;
2364
2365         file->private_data = NULL;
2366
2367         rcu_read_lock();
2368         owner = ACCESS_ONCE(event->owner);
2369         /*
2370          * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2371          * !owner it means the list deletion is complete and we can indeed
2372          * free this event, otherwise we need to serialize on
2373          * owner->perf_event_mutex.
2374          */
2375         smp_read_barrier_depends();
2376         if (owner) {
2377                 /*
2378                  * Since delayed_put_task_struct() also drops the last
2379                  * task reference we can safely take a new reference
2380                  * while holding the rcu_read_lock().
2381                  */
2382                 get_task_struct(owner);
2383         }
2384         rcu_read_unlock();
2385
2386         if (owner) {
2387                 mutex_lock(&owner->perf_event_mutex);
2388                 /*
2389                  * We have to re-check the event->owner field, if it is cleared
2390                  * we raced with perf_event_exit_task(), acquiring the mutex
2391                  * ensured they're done, and we can proceed with freeing the
2392                  * event.
2393                  */
2394                 if (event->owner)
2395                         list_del_init(&event->owner_entry);
2396                 mutex_unlock(&owner->perf_event_mutex);
2397                 put_task_struct(owner);
2398         }
2399
2400         return perf_event_release_kernel(event);
2401 }
2402
2403 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
2404 {
2405         struct perf_event *child;
2406         u64 total = 0;
2407
2408         *enabled = 0;
2409         *running = 0;
2410
2411         mutex_lock(&event->child_mutex);
2412         total += perf_event_read(event);
2413         *enabled += event->total_time_enabled +
2414                         atomic64_read(&event->child_total_time_enabled);
2415         *running += event->total_time_running +
2416                         atomic64_read(&event->child_total_time_running);
2417
2418         list_for_each_entry(child, &event->child_list, child_list) {
2419                 total += perf_event_read(child);
2420                 *enabled += child->total_time_enabled;
2421                 *running += child->total_time_running;
2422         }
2423         mutex_unlock(&event->child_mutex);
2424
2425         return total;
2426 }
2427 EXPORT_SYMBOL_GPL(perf_event_read_value);
2428
2429 static int perf_event_read_group(struct perf_event *event,
2430                                    u64 read_format, char __user *buf)
2431 {
2432         struct perf_event *leader = event->group_leader, *sub;
2433         int n = 0, size = 0, ret = -EFAULT;
2434         struct perf_event_context *ctx = leader->ctx;
2435         u64 values[5];
2436         u64 count, enabled, running;
2437
2438         mutex_lock(&ctx->mutex);
2439         count = perf_event_read_value(leader, &enabled, &running);
2440
2441         values[n++] = 1 + leader->nr_siblings;
2442         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2443                 values[n++] = enabled;
2444         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2445                 values[n++] = running;
2446         values[n++] = count;
2447         if (read_format & PERF_FORMAT_ID)
2448                 values[n++] = primary_event_id(leader);
2449
2450         size = n * sizeof(u64);
2451
2452         if (copy_to_user(buf, values, size))
2453                 goto unlock;
2454
2455         ret = size;
2456
2457         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2458                 n = 0;
2459
2460                 values[n++] = perf_event_read_value(sub, &enabled, &running);
2461                 if (read_format & PERF_FORMAT_ID)
2462                         values[n++] = primary_event_id(sub);
2463
2464                 size = n * sizeof(u64);
2465
2466                 if (copy_to_user(buf + ret, values, size)) {
2467                         ret = -EFAULT;
2468                         goto unlock;
2469                 }
2470
2471                 ret += size;
2472         }
2473 unlock:
2474         mutex_unlock(&ctx->mutex);
2475
2476         return ret;
2477 }
2478
2479 static int perf_event_read_one(struct perf_event *event,
2480                                  u64 read_format, char __user *buf)
2481 {
2482         u64 enabled, running;
2483         u64 values[4];
2484         int n = 0;
2485
2486         values[n++] = perf_event_read_value(event, &enabled, &running);
2487         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2488                 values[n++] = enabled;
2489         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2490                 values[n++] = running;
2491         if (read_format & PERF_FORMAT_ID)
2492                 values[n++] = primary_event_id(event);
2493
2494         if (copy_to_user(buf, values, n * sizeof(u64)))
2495                 return -EFAULT;
2496
2497         return n * sizeof(u64);
2498 }
2499
2500 /*
2501  * Read the performance event - simple non blocking version for now
2502  */
2503 static ssize_t
2504 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2505 {
2506         u64 read_format = event->attr.read_format;
2507         int ret;
2508
2509         /*
2510          * Return end-of-file for a read on a event that is in
2511          * error state (i.e. because it was pinned but it couldn't be
2512          * scheduled on to the CPU at some point).
2513          */
2514         if (event->state == PERF_EVENT_STATE_ERROR)
2515                 return 0;
2516
2517         if (count < event->read_size)
2518                 return -ENOSPC;
2519
2520         WARN_ON_ONCE(event->ctx->parent_ctx);
2521         if (read_format & PERF_FORMAT_GROUP)
2522                 ret = perf_event_read_group(event, read_format, buf);
2523         else
2524                 ret = perf_event_read_one(event, read_format, buf);
2525
2526         return ret;
2527 }
2528
2529 static ssize_t
2530 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
2531 {
2532         struct perf_event *event = file->private_data;
2533
2534         return perf_read_hw(event, buf, count);
2535 }
2536
2537 static unsigned int perf_poll(struct file *file, poll_table *wait)
2538 {
2539         struct perf_event *event = file->private_data;
2540         struct perf_buffer *buffer;
2541         unsigned int events = POLL_HUP;
2542
2543         rcu_read_lock();
2544         buffer = rcu_dereference(event->buffer);
2545         if (buffer)
2546                 events = atomic_xchg(&buffer->poll, 0);
2547         rcu_read_unlock();
2548
2549         poll_wait(file, &event->waitq, wait);
2550
2551         return events;
2552 }
2553
2554 static void perf_event_reset(struct perf_event *event)
2555 {
2556         (void)perf_event_read(event);
2557         local64_set(&event->count, 0);
2558         perf_event_update_userpage(event);
2559 }
2560
2561 /*
2562  * Holding the top-level event's child_mutex means that any
2563  * descendant process that has inherited this event will block
2564  * in sync_child_event if it goes to exit, thus satisfying the
2565  * task existence requirements of perf_event_enable/disable.
2566  */
2567 static void perf_event_for_each_child(struct perf_event *event,
2568                                         void (*func)(struct perf_event *))
2569 {
2570         struct perf_event *child;
2571
2572         WARN_ON_ONCE(event->ctx->parent_ctx);
2573         mutex_lock(&event->child_mutex);
2574         func(event);
2575         list_for_each_entry(child, &event->child_list, child_list)
2576                 func(child);
2577         mutex_unlock(&event->child_mutex);
2578 }
2579
2580 static void perf_event_for_each(struct perf_event *event,
2581                                   void (*func)(struct perf_event *))
2582 {
2583         struct perf_event_context *ctx = event->ctx;
2584         struct perf_event *sibling;
2585
2586         WARN_ON_ONCE(ctx->parent_ctx);
2587         mutex_lock(&ctx->mutex);
2588         event = event->group_leader;
2589
2590         perf_event_for_each_child(event, func);
2591         func(event);
2592         list_for_each_entry(sibling, &event->sibling_list, group_entry)
2593                 perf_event_for_each_child(event, func);
2594         mutex_unlock(&ctx->mutex);
2595 }
2596
2597 static int perf_event_period(struct perf_event *event, u64 __user *arg)
2598 {
2599         struct perf_event_context *ctx = event->ctx;
2600         int ret = 0;
2601         u64 value;
2602
2603         if (!is_sampling_event(event))
2604                 return -EINVAL;
2605
2606         if (copy_from_user(&value, arg, sizeof(value)))
2607                 return -EFAULT;
2608
2609         if (!value)
2610                 return -EINVAL;
2611
2612         raw_spin_lock_irq(&ctx->lock);
2613         if (event->attr.freq) {
2614                 if (value > sysctl_perf_event_sample_rate) {
2615                         ret = -EINVAL;
2616                         goto unlock;
2617                 }
2618
2619                 event->attr.sample_freq = value;
2620         } else {
2621                 event->attr.sample_period = value;
2622                 event->hw.sample_period = value;
2623         }
2624 unlock:
2625         raw_spin_unlock_irq(&ctx->lock);
2626
2627         return ret;
2628 }
2629
2630 static const struct file_operations perf_fops;
2631
2632 static struct perf_event *perf_fget_light(int fd, int *fput_needed)
2633 {
2634         struct file *file;
2635
2636         file = fget_light(fd, fput_needed);
2637         if (!file)
2638                 return ERR_PTR(-EBADF);
2639
2640         if (file->f_op != &perf_fops) {
2641                 fput_light(file, *fput_needed);
2642                 *fput_needed = 0;
2643                 return ERR_PTR(-EBADF);
2644         }
2645
2646         return file->private_data;
2647 }
2648
2649 static int perf_event_set_output(struct perf_event *event,
2650                                  struct perf_event *output_event);
2651 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2652
2653 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2654 {
2655         struct perf_event *event = file->private_data;
2656         void (*func)(struct perf_event *);
2657         u32 flags = arg;
2658
2659         switch (cmd) {
2660         case PERF_EVENT_IOC_ENABLE:
2661                 func = perf_event_enable;
2662                 break;
2663         case PERF_EVENT_IOC_DISABLE:
2664                 func = perf_event_disable;
2665                 break;
2666         case PERF_EVENT_IOC_RESET:
2667                 func = perf_event_reset;
2668                 break;
2669
2670         case PERF_EVENT_IOC_REFRESH:
2671                 return perf_event_refresh(event, arg);
2672
2673         case PERF_EVENT_IOC_PERIOD:
2674                 return perf_event_period(event, (u64 __user *)arg);
2675
2676         case PERF_EVENT_IOC_SET_OUTPUT:
2677         {
2678                 struct perf_event *output_event = NULL;
2679                 int fput_needed = 0;
2680                 int ret;
2681
2682                 if (arg != -1) {
2683                         output_event = perf_fget_light(arg, &fput_needed);
2684                         if (IS_ERR(output_event))
2685                                 return PTR_ERR(output_event);
2686                 }
2687
2688                 ret = perf_event_set_output(event, output_event);
2689                 if (output_event)
2690                         fput_light(output_event->filp, fput_needed);
2691
2692                 return ret;
2693         }
2694
2695         case PERF_EVENT_IOC_SET_FILTER:
2696                 return perf_event_set_filter(event, (void __user *)arg);
2697
2698         default:
2699                 return -ENOTTY;
2700         }
2701
2702         if (flags & PERF_IOC_FLAG_GROUP)
2703                 perf_event_for_each(event, func);
2704         else
2705                 perf_event_for_each_child(event, func);
2706
2707         return 0;
2708 }
2709
2710 int perf_event_task_enable(void)
2711 {
2712         struct perf_event *event;
2713
2714         mutex_lock(&current->perf_event_mutex);
2715         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2716                 perf_event_for_each_child(event, perf_event_enable);
2717         mutex_unlock(&current->perf_event_mutex);
2718
2719         return 0;
2720 }
2721
2722 int perf_event_task_disable(void)
2723 {
2724         struct perf_event *event;
2725
2726         mutex_lock(&current->perf_event_mutex);
2727         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2728                 perf_event_for_each_child(event, perf_event_disable);
2729         mutex_unlock(&current->perf_event_mutex);
2730
2731         return 0;
2732 }
2733
2734 #ifndef PERF_EVENT_INDEX_OFFSET
2735 # define PERF_EVENT_INDEX_OFFSET 0
2736 #endif
2737
2738 static int perf_event_index(struct perf_event *event)
2739 {
2740         if (event->hw.state & PERF_HES_STOPPED)
2741                 return 0;
2742
2743         if (event->state != PERF_EVENT_STATE_ACTIVE)
2744                 return 0;
2745
2746         return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2747 }
2748
2749 /*
2750  * Callers need to ensure there can be no nesting of this function, otherwise
2751  * the seqlock logic goes bad. We can not serialize this because the arch
2752  * code calls this from NMI context.
2753  */
2754 void perf_event_update_userpage(struct perf_event *event)
2755 {
2756         struct perf_event_mmap_page *userpg;
2757         struct perf_buffer *buffer;
2758
2759         rcu_read_lock();
2760         buffer = rcu_dereference(event->buffer);
2761         if (!buffer)
2762                 goto unlock;
2763
2764         userpg = buffer->user_page;
2765
2766         /*
2767          * Disable preemption so as to not let the corresponding user-space
2768          * spin too long if we get preempted.
2769          */
2770         preempt_disable();
2771         ++userpg->lock;
2772         barrier();
2773         userpg->index = perf_event_index(event);
2774         userpg->offset = perf_event_count(event);
2775         if (event->state == PERF_EVENT_STATE_ACTIVE)
2776                 userpg->offset -= local64_read(&event->hw.prev_count);
2777
2778         userpg->time_enabled = event->total_time_enabled +
2779                         atomic64_read(&event->child_total_time_enabled);
2780
2781         userpg->time_running = event->total_time_running +
2782                         atomic64_read(&event->child_total_time_running);
2783
2784         barrier();
2785         ++userpg->lock;
2786         preempt_enable();
2787 unlock:
2788         rcu_read_unlock();
2789 }
2790
2791 static unsigned long perf_data_size(struct perf_buffer *buffer);
2792
2793 static void
2794 perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
2795 {
2796         long max_size = perf_data_size(buffer);
2797
2798         if (watermark)
2799                 buffer->watermark = min(max_size, watermark);
2800
2801         if (!buffer->watermark)
2802                 buffer->watermark = max_size / 2;
2803
2804         if (flags & PERF_BUFFER_WRITABLE)
2805                 buffer->writable = 1;
2806
2807         atomic_set(&buffer->refcount, 1);
2808 }
2809
2810 #ifndef CONFIG_PERF_USE_VMALLOC
2811
2812 /*
2813  * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2814  */
2815
2816 static struct page *
2817 perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2818 {
2819         if (pgoff > buffer->nr_pages)
2820                 return NULL;
2821
2822         if (pgoff == 0)
2823                 return virt_to_page(buffer->user_page);
2824
2825         return virt_to_page(buffer->data_pages[pgoff - 1]);
2826 }
2827
2828 static void *perf_mmap_alloc_page(int cpu)
2829 {
2830         struct page *page;
2831         int node;
2832
2833         node = (cpu == -1) ? cpu : cpu_to_node(cpu);
2834         page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
2835         if (!page)
2836                 return NULL;
2837
2838         return page_address(page);
2839 }
2840
2841 static struct perf_buffer *
2842 perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2843 {
2844         struct perf_buffer *buffer;
2845         unsigned long size;
2846         int i;
2847
2848         size = sizeof(struct perf_buffer);
2849         size += nr_pages * sizeof(void *);
2850
2851         buffer = kzalloc(size, GFP_KERNEL);
2852         if (!buffer)
2853                 goto fail;
2854
2855         buffer->user_page = perf_mmap_alloc_page(cpu);
2856         if (!buffer->user_page)
2857                 goto fail_user_page;
2858
2859         for (i = 0; i < nr_pages; i++) {
2860                 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
2861                 if (!buffer->data_pages[i])
2862                         goto fail_data_pages;
2863         }
2864
2865         buffer->nr_pages = nr_pages;
2866
2867         perf_buffer_init(buffer, watermark, flags);
2868
2869         return buffer;
2870
2871 fail_data_pages:
2872         for (i--; i >= 0; i--)
2873                 free_page((unsigned long)buffer->data_pages[i]);
2874
2875         free_page((unsigned long)buffer->user_page);
2876
2877 fail_user_page:
2878         kfree(buffer);
2879
2880 fail:
2881         return NULL;
2882 }
2883
2884 static void perf_mmap_free_page(unsigned long addr)
2885 {
2886         struct page *page = virt_to_page((void *)addr);
2887
2888         page->mapping = NULL;
2889         __free_page(page);
2890 }
2891
2892 static void perf_buffer_free(struct perf_buffer *buffer)
2893 {
2894         int i;
2895
2896         perf_mmap_free_page((unsigned long)buffer->user_page);
2897         for (i = 0; i < buffer->nr_pages; i++)
2898                 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
2899         kfree(buffer);
2900 }
2901
/* Page-backed buffers always use order-0 data pages. */
static inline int page_order(struct perf_buffer *buffer)
{
	return 0;
}
2906
2907 #else
2908
2909 /*
2910  * Back perf_mmap() with vmalloc memory.
2911  *
2912  * Required for architectures that have d-cache aliasing issues.
2913  */
2914
2915 static inline int page_order(struct perf_buffer *buffer)
2916 {
2917         return buffer->page_order;
2918 }
2919
2920 static struct page *
2921 perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
2922 {
2923         if (pgoff > (1UL << page_order(buffer)))
2924                 return NULL;
2925
2926         return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
2927 }
2928
2929 static void perf_mmap_unmark_page(void *addr)
2930 {
2931         struct page *page = vmalloc_to_page(addr);
2932
2933         page->mapping = NULL;
2934 }
2935
2936 static void perf_buffer_free_work(struct work_struct *work)
2937 {
2938         struct perf_buffer *buffer;
2939         void *base;
2940         int i, nr;
2941
2942         buffer = container_of(work, struct perf_buffer, work);
2943         nr = 1 << page_order(buffer);
2944
2945         base = buffer->user_page;
2946         for (i = 0; i < nr + 1; i++)
2947                 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2948
2949         vfree(base);
2950         kfree(buffer);
2951 }
2952
2953 static void perf_buffer_free(struct perf_buffer *buffer)
2954 {
2955         schedule_work(&buffer->work);
2956 }
2957
2958 static struct perf_buffer *
2959 perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
2960 {
2961         struct perf_buffer *buffer;
2962         unsigned long size;
2963         void *all_buf;
2964
2965         size = sizeof(struct perf_buffer);
2966         size += sizeof(void *);
2967
2968         buffer = kzalloc(size, GFP_KERNEL);
2969         if (!buffer)
2970                 goto fail;
2971
2972         INIT_WORK(&buffer->work, perf_buffer_free_work);
2973
2974         all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2975         if (!all_buf)
2976                 goto fail_all_buf;
2977
2978         buffer->user_page = all_buf;
2979         buffer->data_pages[0] = all_buf + PAGE_SIZE;
2980         buffer->page_order = ilog2(nr_pages);
2981         buffer->nr_pages = 1;
2982
2983         perf_buffer_init(buffer, watermark, flags);
2984
2985         return buffer;
2986
2987 fail_all_buf:
2988         kfree(buffer);
2989
2990 fail:
2991         return NULL;
2992 }
2993
2994 #endif
2995
2996 static unsigned long perf_data_size(struct perf_buffer *buffer)
2997 {
2998         return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
2999 }
3000
3001 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3002 {
3003         struct perf_event *event = vma->vm_file->private_data;
3004         struct perf_buffer *buffer;
3005         int ret = VM_FAULT_SIGBUS;
3006
3007         if (vmf->flags & FAULT_FLAG_MKWRITE) {
3008                 if (vmf->pgoff == 0)
3009                         ret = 0;
3010                 return ret;
3011         }
3012
3013         rcu_read_lock();
3014         buffer = rcu_dereference(event->buffer);
3015         if (!buffer)
3016                 goto unlock;
3017
3018         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3019                 goto unlock;
3020
3021         vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
3022         if (!vmf->page)
3023                 goto unlock;
3024
3025         get_page(vmf->page);
3026         vmf->page->mapping = vma->vm_file->f_mapping;
3027         vmf->page->index   = vmf->pgoff;
3028
3029         ret = 0;
3030 unlock:
3031         rcu_read_unlock();
3032
3033         return ret;
3034 }
3035
3036 static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
3037 {
3038         struct perf_buffer *buffer;
3039
3040         buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
3041         perf_buffer_free(buffer);
3042 }
3043
3044 static struct perf_buffer *perf_buffer_get(struct perf_event *event)
3045 {
3046         struct perf_buffer *buffer;
3047
3048         rcu_read_lock();
3049         buffer = rcu_dereference(event->buffer);
3050         if (buffer) {
3051                 if (!atomic_inc_not_zero(&buffer->refcount))
3052                         buffer = NULL;
3053         }
3054         rcu_read_unlock();
3055
3056         return buffer;
3057 }
3058
3059 static void perf_buffer_put(struct perf_buffer *buffer)
3060 {
3061         if (!atomic_dec_and_test(&buffer->refcount))
3062                 return;
3063
3064         call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
3065 }
3066
3067 static void perf_mmap_open(struct vm_area_struct *vma)
3068 {
3069         struct perf_event *event = vma->vm_file->private_data;
3070
3071         atomic_inc(&event->mmap_count);
3072 }
3073
3074 static void perf_mmap_close(struct vm_area_struct *vma)
3075 {
3076         struct perf_event *event = vma->vm_file->private_data;
3077
3078         if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
3079                 unsigned long size = perf_data_size(event->buffer);
3080                 struct user_struct *user = event->mmap_user;
3081                 struct perf_buffer *buffer = event->buffer;
3082
3083                 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3084                 vma->vm_mm->locked_vm -= event->mmap_locked;
3085                 rcu_assign_pointer(event->buffer, NULL);
3086                 mutex_unlock(&event->mmap_mutex);
3087
3088                 perf_buffer_put(buffer);
3089                 free_uid(user);
3090         }
3091 }
3092
3093 static const struct vm_operations_struct perf_mmap_vmops = {
3094         .open           = perf_mmap_open,
3095         .close          = perf_mmap_close,
3096         .fault          = perf_mmap_fault,
3097         .page_mkwrite   = perf_mmap_fault,
3098 };
3099
3100 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3101 {
3102         struct perf_event *event = file->private_data;
3103         unsigned long user_locked, user_lock_limit;
3104         struct user_struct *user = current_user();
3105         unsigned long locked, lock_limit;
3106         struct perf_buffer *buffer;
3107         unsigned long vma_size;
3108         unsigned long nr_pages;
3109         long user_extra, extra;
3110         int ret = 0, flags = 0;
3111
3112         /*
3113          * Don't allow mmap() of inherited per-task counters. This would
3114          * create a performance issue due to all children writing to the
3115          * same buffer.
3116          */
3117         if (event->cpu == -1 && event->attr.inherit)
3118                 return -EINVAL;
3119
3120         if (!(vma->vm_flags & VM_SHARED))
3121                 return -EINVAL;
3122
3123         vma_size = vma->vm_end - vma->vm_start;
3124         nr_pages = (vma_size / PAGE_SIZE) - 1;
3125
3126         /*
3127          * If we have buffer pages ensure they're a power-of-two number, so we
3128          * can do bitmasks instead of modulo.
3129          */
3130         if (nr_pages != 0 && !is_power_of_2(nr_pages))
3131                 return -EINVAL;
3132
3133         if (vma_size != PAGE_SIZE * (1 + nr_pages))
3134                 return -EINVAL;
3135
3136         if (vma->vm_pgoff != 0)
3137                 return -EINVAL;
3138
3139         WARN_ON_ONCE(event->ctx->parent_ctx);
3140         mutex_lock(&event->mmap_mutex);
3141         if (event->buffer) {
3142                 if (event->buffer->nr_pages == nr_pages)
3143                         atomic_inc(&event->buffer->refcount);
3144                 else
3145                         ret = -EINVAL;
3146                 goto unlock;
3147         }
3148
3149         user_extra = nr_pages + 1;
3150         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
3151
3152         /*
3153          * Increase the limit linearly with more CPUs:
3154          */
3155         user_lock_limit *= num_online_cpus();
3156
3157         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
3158
3159         extra = 0;
3160         if (user_locked > user_lock_limit)
3161                 extra = user_locked - user_lock_limit;
3162
3163         lock_limit = rlimit(RLIMIT_MEMLOCK);
3164         lock_limit >>= PAGE_SHIFT;
3165         locked = vma->vm_mm->locked_vm + extra;
3166
3167         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3168                 !capable(CAP_IPC_LOCK)) {
3169                 ret = -EPERM;
3170                 goto unlock;
3171         }
3172
3173         WARN_ON(event->buffer);
3174
3175         if (vma->vm_flags & VM_WRITE)
3176                 flags |= PERF_BUFFER_WRITABLE;
3177
3178         buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
3179                                    event->cpu, flags);
3180         if (!buffer) {
3181                 ret = -ENOMEM;
3182                 goto unlock;
3183         }
3184         rcu_assign_pointer(event->buffer, buffer);
3185
3186         atomic_long_add(user_extra, &user->locked_vm);
3187         event->mmap_locked = extra;
3188         event->mmap_user = get_current_user();
3189         vma->vm_mm->locked_vm += event->mmap_locked;
3190
3191 unlock:
3192         if (!ret)
3193                 atomic_inc(&event->mmap_count);
3194         mutex_unlock(&event->mmap_mutex);
3195
3196         vma->vm_flags |= VM_RESERVED;
3197         vma->vm_ops = &perf_mmap_vmops;
3198
3199         return ret;
3200 }
3201
3202 static int perf_fasync(int fd, struct file *filp, int on)
3203 {
3204         struct inode *inode = filp->f_path.dentry->d_inode;
3205         struct perf_event *event = filp->private_data;
3206         int retval;
3207
3208         mutex_lock(&inode->i_mutex);
3209         retval = fasync_helper(fd, filp, on, &event->fasync);
3210         mutex_unlock(&inode->i_mutex);
3211
3212         if (retval < 0)
3213                 return retval;
3214
3215         return 0;
3216 }
3217
3218 static const struct file_operations perf_fops = {
3219         .llseek                 = no_llseek,
3220         .release                = perf_release,
3221         .read                   = perf_read,
3222         .poll                   = perf_poll,
3223         .unlocked_ioctl         = perf_ioctl,
3224         .compat_ioctl           = perf_ioctl,
3225         .mmap                   = perf_mmap,
3226         .fasync                 = perf_fasync,
3227 };
3228
3229 /*
3230  * Perf event wakeup
3231  *
3232  * If there's data, ensure we set the poll() state and publish everything
3233  * to user-space before waking everybody up.
3234  */
3235
3236 void perf_event_wakeup(struct perf_event *event)
3237 {
3238         wake_up_all(&event->waitq);
3239
3240         if (event->pending_kill) {
3241                 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
3242                 event->pending_kill = 0;
3243         }
3244 }
3245
3246 static void perf_pending_event(struct irq_work *entry)
3247 {
3248         struct perf_event *event = container_of(entry,
3249                         struct perf_event, pending);
3250
3251         if (event->pending_disable) {
3252                 event->pending_disable = 0;
3253                 __perf_event_disable(event);
3254         }
3255
3256         if (event->pending_wakeup) {
3257                 event->pending_wakeup = 0;
3258                 perf_event_wakeup(event);
3259         }
3260 }
3261
3262 /*
3263  * We assume there is only KVM supporting the callbacks.
3264  * Later on, we might change it to a list if there is
3265  * another virtualization implementation supporting the callbacks.
3266  */
3267 struct perf_guest_info_callbacks *perf_guest_cbs;
3268
3269 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3270 {
3271         perf_guest_cbs = cbs;
3272         return 0;
3273 }
3274 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
3275
3276 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3277 {
3278         perf_guest_cbs = NULL;
3279         return 0;
3280 }
3281 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3282
3283 /*
3284  * Output
3285  */
3286 static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
3287                               unsigned long offset, unsigned long head)
3288 {
3289         unsigned long mask;
3290
3291         if (!buffer->writable)
3292                 return true;
3293
3294         mask = perf_data_size(buffer) - 1;
3295
3296         offset = (offset - tail) & mask;
3297         head   = (head   - tail) & mask;
3298
3299         if ((int)(head - offset) < 0)
3300                 return false;
3301
3302         return true;
3303 }
3304
3305 static void perf_output_wakeup(struct perf_output_handle *handle)
3306 {
3307         atomic_set(&handle->buffer->poll, POLL_IN);
3308
3309         if (handle->nmi) {
3310                 handle->event->pending_wakeup = 1;
3311                 irq_work_queue(&handle->event->pending);
3312         } else
3313                 perf_event_wakeup(handle->event);
3314 }
3315
3316 /*
3317  * We need to ensure a later event_id doesn't publish a head when a former
3318  * event isn't done writing. However since we need to deal with NMIs we
3319  * cannot fully serialize things.
3320  *
3321  * We only publish the head (and generate a wakeup) when the outer-most
3322  * event completes.
3323  */
3324 static void perf_output_get_handle(struct perf_output_handle *handle)
3325 {
3326         struct perf_buffer *buffer = handle->buffer;
3327
3328         preempt_disable();
3329         local_inc(&buffer->nest);
3330         handle->wakeup = local_read(&buffer->wakeup);
3331 }
3332
3333 static void perf_output_put_handle(struct perf_output_handle *handle)
3334 {
3335         struct perf_buffer *buffer = handle->buffer;
3336         unsigned long head;
3337
3338 again:
3339         head = local_read(&buffer->head);
3340
3341         /*
3342          * IRQ/NMI can happen here, which means we can miss a head update.
3343          */
3344
3345         if (!local_dec_and_test(&buffer->nest))
3346                 goto out;
3347
3348         /*
3349          * Publish the known good head. Rely on the full barrier implied
3350          * by atomic_dec_and_test() order the buffer->head read and this
3351          * write.
3352          */
3353         buffer->user_page->data_head = head;
3354
3355         /*
3356          * Now check if we missed an update, rely on the (compiler)
3357          * barrier in atomic_dec_and_test() to re-read buffer->head.
3358          */
3359         if (unlikely(head != local_read(&buffer->head))) {
3360                 local_inc(&buffer->nest);
3361                 goto again;
3362         }
3363
3364         if (handle->wakeup != local_read(&buffer->wakeup))
3365                 perf_output_wakeup(handle);
3366
3367 out:
3368         preempt_enable();
3369 }
3370
3371 __always_inline void perf_output_copy(struct perf_output_handle *handle,
3372                       const void *buf, unsigned int len)
3373 {
3374         do {
3375                 unsigned long size = min_t(unsigned long, handle->size, len);
3376
3377                 memcpy(handle->addr, buf, size);
3378
3379                 len -= size;
3380                 handle->addr += size;
3381                 buf += size;
3382                 handle->size -= size;
3383                 if (!handle->size) {
3384                         struct perf_buffer *buffer = handle->buffer;
3385
3386                         handle->page++;
3387                         handle->page &= buffer->nr_pages - 1;
3388                         handle->addr = buffer->data_pages[handle->page];
3389                         handle->size = PAGE_SIZE << page_order(buffer);
3390                 }
3391         } while (len);
3392 }
3393
3394 static void __perf_event_header__init_id(struct perf_event_header *header,
3395                                          struct perf_sample_data *data,
3396                                          struct perf_event *event)
3397 {
3398         u64 sample_type = event->attr.sample_type;
3399
3400         data->type = sample_type;
3401         header->size += event->id_header_size;
3402
3403         if (sample_type & PERF_SAMPLE_TID) {
3404                 /* namespace issues */
3405                 data->tid_entry.pid = perf_event_pid(event, current);
3406                 data->tid_entry.tid = perf_event_tid(event, current);
3407         }
3408
3409         if (sample_type & PERF_SAMPLE_TIME)
3410                 data->time = perf_clock();
3411
3412         if (sample_type & PERF_SAMPLE_ID)
3413                 data->id = primary_event_id(event);
3414
3415         if (sample_type & PERF_SAMPLE_STREAM_ID)
3416                 data->stream_id = event->id;
3417
3418         if (sample_type & PERF_SAMPLE_CPU) {
3419                 data->cpu_entry.cpu      = raw_smp_processor_id();
3420                 data->cpu_entry.reserved = 0;
3421         }
3422 }
3423
3424 static void perf_event_header__init_id(struct perf_event_header *header,
3425                                        struct perf_sample_data *data,
3426                                        struct perf_event *event)
3427 {
3428         if (event->attr.sample_id_all)
3429                 __perf_event_header__init_id(header, data, event);
3430 }
3431
3432 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
3433                                            struct perf_sample_data *data)
3434 {
3435         u64 sample_type = data->type;
3436
3437         if (sample_type & PERF_SAMPLE_TID)
3438                 perf_output_put(handle, data->tid_entry);
3439
3440         if (sample_type & PERF_SAMPLE_TIME)
3441                 perf_output_put(handle, data->time);
3442
3443         if (sample_type & PERF_SAMPLE_ID)
3444                 perf_output_put(handle, data->id);
3445
3446         if (sample_type & PERF_SAMPLE_STREAM_ID)
3447                 perf_output_put(handle, data->stream_id);
3448
3449         if (sample_type & PERF_SAMPLE_CPU)
3450                 perf_output_put(handle, data->cpu_entry);
3451 }
3452
3453 static void perf_event__output_id_sample(struct perf_event *event,
3454                                          struct perf_output_handle *handle,
3455                                          struct perf_sample_data *sample)
3456 {
3457         if (event->attr.sample_id_all)
3458                 __perf_event__output_id_sample(handle, sample);
3459 }
3460
3461 int perf_output_begin(struct perf_output_handle *handle,
3462                       struct perf_event *event, unsigned int size,
3463                       int nmi, int sample)
3464 {
3465         struct perf_buffer *buffer;
3466         unsigned long tail, offset, head;
3467         int have_lost;
3468         struct perf_sample_data sample_data;
3469         struct {
3470                 struct perf_event_header header;
3471                 u64                      id;
3472                 u64                      lost;
3473         } lost_event;
3474
3475         rcu_read_lock();
3476         /*
3477          * For inherited events we send all the output towards the parent.
3478          */
3479         if (event->parent)
3480                 event = event->parent;
3481
3482         buffer = rcu_dereference(event->buffer);
3483         if (!buffer)
3484                 goto out;
3485
3486         handle->buffer  = buffer;
3487         handle->event   = event;
3488         handle->nmi     = nmi;
3489         handle->sample  = sample;
3490
3491         if (!buffer->nr_pages)
3492                 goto out;
3493
3494         have_lost = local_read(&buffer->lost);
3495         if (have_lost) {
3496                 lost_event.header.size = sizeof(lost_event);
3497                 perf_event_header__init_id(&lost_event.header, &sample_data,
3498                                            event);
3499                 size += lost_event.header.size;
3500         }
3501
3502         perf_output_get_handle(handle);
3503
3504         do {
3505                 /*
3506                  * Userspace could choose to issue a mb() before updating the
3507                  * tail pointer. So that all reads will be completed before the
3508                  * write is issued.
3509                  */
3510                 tail = ACCESS_ONCE(buffer->user_page->data_tail);
3511                 smp_rmb();
3512                 offset = head = local_read(&buffer->head);
3513                 head += size;
3514                 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
3515                         goto fail;
3516         } while (local_cmpxchg(&buffer->head, offset, head) != offset);
3517
3518         if (head - local_read(&buffer->wakeup) > buffer->watermark)
3519                 local_add(buffer->watermark, &buffer->wakeup);
3520
3521         handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
3522         handle->page &= buffer->nr_pages - 1;
3523         handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
3524         handle->addr = buffer->data_pages[handle->page];
3525         handle->addr += handle->size;
3526         handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
3527
3528         if (have_lost) {
3529                 lost_event.header.type = PERF_RECORD_LOST;
3530                 lost_event.header.misc = 0;
3531                 lost_event.id          = event->id;
3532                 lost_event.lost        = local_xchg(&buffer->lost, 0);
3533
3534                 perf_output_put(handle, lost_event);
3535                 perf_event__output_id_sample(event, handle, &sample_data);
3536         }
3537
3538         return 0;
3539
3540 fail:
3541         local_inc(&buffer->lost);
3542         perf_output_put_handle(handle);
3543 out:
3544         rcu_read_unlock();
3545
3546         return -ENOSPC;
3547 }
3548
3549 void perf_output_end(struct perf_output_handle *handle)
3550 {
3551         struct perf_event *event = handle->event;
3552         struct perf_buffer *buffer = handle->buffer;
3553
3554         int wakeup_events = event->attr.wakeup_events;
3555
3556         if (handle->sample && wakeup_events) {
3557                 int events = local_inc_return(&buffer->events);
3558                 if (events >= wakeup_events) {
3559                         local_sub(wakeup_events, &buffer->events);
3560                         local_inc(&buffer->wakeup);
3561                 }
3562         }
3563
3564         perf_output_put_handle(handle);
3565         rcu_read_unlock();
3566 }
3567
3568 static void perf_output_read_one(struct perf_output_handle *handle,
3569                                  struct perf_event *event,
3570                                  u64 enabled, u64 running)
3571 {
3572         u64 read_format = event->attr.read_format;
3573         u64 values[4];
3574         int n = 0;
3575
3576         values[n++] = perf_event_count(event);
3577         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3578                 values[n++] = enabled +
3579                         atomic64_read(&event->child_total_time_enabled);
3580         }
3581         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3582                 values[n++] = running +
3583                         atomic64_read(&event->child_total_time_running);
3584         }
3585         if (read_format & PERF_FORMAT_ID)
3586                 values[n++] = primary_event_id(event);
3587
3588         perf_output_copy(handle, values, n * sizeof(u64));
3589 }
3590
3591 /*
3592  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3593  */
3594 static void perf_output_read_group(struct perf_output_handle *handle,
3595                             struct perf_event *event,
3596                             u64 enabled, u64 running)
3597 {
3598         struct perf_event *leader = event->group_leader, *sub;
3599         u64 read_format = event->attr.read_format;
3600         u64 values[5];
3601         int n = 0;
3602
3603         values[n++] = 1 + leader->nr_siblings;
3604
3605         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3606                 values[n++] = enabled;
3607
3608         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3609                 values[n++] = running;
3610
3611         if (leader != event)
3612                 leader->pmu->read(leader);
3613
3614         values[n++] = perf_event_count(leader);
3615         if (read_format & PERF_FORMAT_ID)
3616                 values[n++] = primary_event_id(leader);
3617
3618         perf_output_copy(handle, values, n * sizeof(u64));
3619
3620         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3621                 n = 0;
3622
3623                 if (sub != event)
3624                         sub->pmu->read(sub);
3625
3626                 values[n++] = perf_event_count(sub);
3627                 if (read_format & PERF_FORMAT_ID)
3628                         values[n++] = primary_event_id(sub);
3629
3630                 perf_output_copy(handle, values, n * sizeof(u64));
3631         }
3632 }
3633
3634 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
3635                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
3636
3637 static void perf_output_read(struct perf_output_handle *handle,
3638                              struct perf_event *event)
3639 {
3640         u64 enabled = 0, running = 0, now, ctx_time;
3641         u64 read_format = event->attr.read_format;
3642
3643         /*
3644          * compute total_time_enabled, total_time_running
3645          * based on snapshot values taken when the event
3646          * was last scheduled in.
3647          *
3648          * we cannot simply called update_context_time()
3649          * because of locking issue as we are called in
3650          * NMI context
3651          */
3652         if (read_format & PERF_FORMAT_TOTAL_TIMES) {
3653                 now = perf_clock();
3654                 ctx_time = event->shadow_ctx_time + now;
3655                 enabled = ctx_time - event->tstamp_enabled;
3656                 running = ctx_time - event->tstamp_running;
3657         }
3658
3659         if (event->attr.read_format & PERF_FORMAT_GROUP)
3660                 perf_output_read_group(handle, event, enabled, running);
3661         else
3662                 perf_output_read_one(handle, event, enabled, running);
3663 }
3664
3665 void perf_output_sample(struct perf_output_handle *handle,
3666                         struct perf_event_header *header,
3667                         struct perf_sample_data *data,
3668                         struct perf_event *event)
3669 {
3670         u64 sample_type = data->type;
3671
3672         perf_output_put(handle, *header);
3673
3674         if (sample_type & PERF_SAMPLE_IP)
3675                 perf_output_put(handle, data->ip);
3676
3677         if (sample_type & PERF_SAMPLE_TID)
3678                 perf_output_put(handle, data->tid_entry);
3679
3680         if (sample_type & PERF_SAMPLE_TIME)
3681                 perf_output_put(handle, data->time);
3682
3683         if (sample_type & PERF_SAMPLE_ADDR)
3684                 perf_output_put(handle, data->addr);
3685
3686         if (sample_type & PERF_SAMPLE_ID)
3687                 perf_output_put(handle, data->id);
3688
3689         if (sample_type & PERF_SAMPLE_STREAM_ID)
3690                 perf_output_put(handle, data->stream_id);
3691
3692         if (sample_type & PERF_SAMPLE_CPU)
3693                 perf_output_put(handle, data->cpu_entry);
3694
3695         if (sample_type & PERF_SAMPLE_PERIOD)
3696                 perf_output_put(handle, data->period);
3697
3698         if (sample_type & PERF_SAMPLE_READ)
3699                 perf_output_read(handle, event);
3700
3701         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3702                 if (data->callchain) {
3703                         int size = 1;
3704
3705                         if (data->callchain)
3706                                 size += data->callchain->nr;
3707
3708                         size *= sizeof(u64);
3709
3710                         perf_output_copy(handle, data->callchain, size);
3711                 } else {
3712                         u64 nr = 0;
3713                         perf_output_put(handle, nr);
3714                 }
3715         }
3716
3717         if (sample_type & PERF_SAMPLE_RAW) {
3718                 if (data->raw) {
3719                         perf_output_put(handle, data->raw->size);
3720                         perf_output_copy(handle, data->raw->data,
3721                                          data->raw->size);
3722                 } else {
3723                         struct {
3724                                 u32     size;
3725                                 u32     data;
3726                         } raw = {
3727                                 .size = sizeof(u32),
3728                                 .data = 0,
3729                         };
3730                         perf_output_put(handle, raw);
3731                 }
3732         }
3733 }
3734
3735 void perf_prepare_sample(struct perf_event_header *header,
3736                          struct perf_sample_data *data,
3737                          struct perf_event *event,
3738                          struct pt_regs *regs)
3739 {
3740         u64 sample_type = event->attr.sample_type;
3741
3742         header->type = PERF_RECORD_SAMPLE;
3743         header->size = sizeof(*header) + event->header_size;
3744
3745         header->misc = 0;
3746         header->misc |= perf_misc_flags(regs);
3747
3748         __perf_event_header__init_id(header, data, event);
3749
3750         if (sample_type & PERF_SAMPLE_IP)
3751                 data->ip = perf_instruction_pointer(regs);
3752
3753         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3754                 int size = 1;
3755
3756                 data->callchain = perf_callchain(regs);
3757
3758                 if (data->callchain)
3759                         size += data->callchain->nr;
3760
3761                 header->size += size * sizeof(u64);
3762         }
3763
3764         if (sample_type & PERF_SAMPLE_RAW) {
3765                 int size = sizeof(u32);
3766
3767                 if (data->raw)
3768                         size += data->raw->size;
3769                 else
3770                         size += sizeof(u32);
3771
3772                 WARN_ON_ONCE(size & (sizeof(u64)-1));
3773                 header->size += size;
3774         }
3775 }
3776
3777 static void perf_event_output(struct perf_event *event, int nmi,
3778                                 struct perf_sample_data *data,
3779                                 struct pt_regs *regs)
3780 {
3781         struct perf_output_handle handle;
3782         struct perf_event_header header;
3783
3784         /* protect the callchain buffers */
3785         rcu_read_lock();
3786
3787         perf_prepare_sample(&header, data, event, regs);
3788
3789         if (perf_output_begin(&handle, event, header.size, nmi, 1))
3790                 goto exit;
3791
3792         perf_output_sample(&handle, &header, data, event);
3793
3794         perf_output_end(&handle);
3795
3796 exit:
3797         rcu_read_unlock();
3798 }
3799
3800 /*
3801  * read event_id
3802  */
3803
3804 struct perf_read_event {
3805         struct perf_event_header        header;
3806
3807         u32                             pid;
3808         u32                             tid;
3809 };
3810
3811 static void
3812 perf_event_read_event(struct perf_event *event,
3813                         struct task_struct *task)
3814 {
3815         struct perf_output_handle handle;
3816         struct perf_sample_data sample;
3817         struct perf_read_event read_event = {
3818                 .header = {
3819                         .type = PERF_RECORD_READ,
3820                         .misc = 0,
3821                         .size = sizeof(read_event) + event->read_size,
3822                 },
3823                 .pid = perf_event_pid(event, task),
3824                 .tid = perf_event_tid(event, task),
3825         };
3826         int ret;
3827
3828         perf_event_header__init_id(&read_event.header, &sample, event);
3829         ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3830         if (ret)
3831                 return;
3832
3833         perf_output_put(&handle, read_event);
3834         perf_output_read(&handle, event);
3835         perf_event__output_id_sample(event, &handle, &sample);
3836
3837         perf_output_end(&handle);
3838 }
3839
3840 /*
3841  * task tracking -- fork/exit
3842  *
3843  * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
3844  */
3845
3846 struct perf_task_event {
3847         struct task_struct              *task;
3848         struct perf_event_context       *task_ctx;
3849
3850         struct {
3851                 struct perf_event_header        header;
3852
3853                 u32                             pid;
3854                 u32                             ppid;
3855                 u32                             tid;
3856                 u32                             ptid;
3857                 u64                             time;
3858         } event_id;
3859 };
3860
3861 static void perf_event_task_output(struct perf_event *event,
3862                                      struct perf_task_event *task_event)
3863 {
3864         struct perf_output_handle handle;
3865         struct perf_sample_data sample;
3866         struct task_struct *task = task_event->task;
3867         int ret, size = task_event->event_id.header.size;
3868
3869         perf_event_header__init_id(&task_event->event_id.header, &sample, event);
3870
3871         ret = perf_output_begin(&handle, event,
3872                                 task_event->event_id.header.size, 0, 0);
3873         if (ret)
3874                 goto out;
3875
3876         task_event->event_id.pid = perf_event_pid(event, task);
3877         task_event->event_id.ppid = perf_event_pid(event, current);
3878
3879         task_event->event_id.tid = perf_event_tid(event, task);
3880         task_event->event_id.ptid = perf_event_tid(event, current);
3881
3882         perf_output_put(&handle, task_event->event_id);
3883
3884         perf_event__output_id_sample(event, &handle, &sample);
3885
3886         perf_output_end(&handle);
3887 out:
3888         task_event->event_id.header.size = size;
3889 }
3890
3891 static int perf_event_task_match(struct perf_event *event)
3892 {
3893         if (event->state < PERF_EVENT_STATE_INACTIVE)
3894                 return 0;
3895
3896         if (event->cpu != -1 && event->cpu != smp_processor_id())
3897                 return 0;
3898
3899         if (event->attr.comm || event->attr.mmap ||
3900             event->attr.mmap_data || event->attr.task)
3901                 return 1;
3902
3903         return 0;
3904 }
3905
3906 static void perf_event_task_ctx(struct perf_event_context *ctx,
3907                                   struct perf_task_event *task_event)
3908 {
3909         struct perf_event *event;
3910
3911         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3912                 if (perf_event_task_match(event))
3913                         perf_event_task_output(event, task_event);
3914         }
3915 }
3916
3917 static void perf_event_task_event(struct perf_task_event *task_event)
3918 {
3919         struct perf_cpu_context *cpuctx;
3920         struct perf_event_context *ctx;
3921         struct pmu *pmu;
3922         int ctxn;
3923
3924         rcu_read_lock();
3925         list_for_each_entry_rcu(pmu, &pmus, entry) {
3926                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3927                 if (cpuctx->active_pmu != pmu)
3928                         goto next;
3929                 perf_event_task_ctx(&cpuctx->ctx, task_event);
3930
3931                 ctx = task_event->task_ctx;
3932                 if (!ctx) {
3933                         ctxn = pmu->task_ctx_nr;
3934                         if (ctxn < 0)
3935                                 goto next;
3936                         ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3937                 }
3938                 if (ctx)
3939                         perf_event_task_ctx(ctx, task_event);
3940 next:
3941                 put_cpu_ptr(pmu->pmu_cpu_context);
3942         }
3943         rcu_read_unlock();
3944 }
3945
3946 static void perf_event_task(struct task_struct *task,
3947                               struct perf_event_context *task_ctx,
3948                               int new)
3949 {
3950         struct perf_task_event task_event;
3951
3952         if (!atomic_read(&nr_comm_events) &&
3953             !atomic_read(&nr_mmap_events) &&
3954             !atomic_read(&nr_task_events))
3955                 return;
3956
3957         task_event = (struct perf_task_event){
3958                 .task     = task,
3959                 .task_ctx = task_ctx,
3960                 .event_id    = {
3961                         .header = {
3962                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3963                                 .misc = 0,
3964                                 .size = sizeof(task_event.event_id),
3965                         },
3966                         /* .pid  */
3967                         /* .ppid */
3968                         /* .tid  */
3969                         /* .ptid */
3970                         .time = perf_clock(),
3971                 },
3972         };
3973
3974         perf_event_task_event(&task_event);
3975 }
3976
3977 void perf_event_fork(struct task_struct *task)
3978 {
3979         perf_event_task(task, NULL, 1);
3980 }
3981
3982 /*
3983  * comm tracking
3984  */
3985
3986 struct perf_comm_event {
3987         struct task_struct      *task;
3988         char                    *comm;
3989         int                     comm_size;
3990
3991         struct {
3992                 struct perf_event_header        header;
3993
3994                 u32                             pid;
3995                 u32                             tid;
3996         } event_id;
3997 };
3998
3999 static void perf_event_comm_output(struct perf_event *event,
4000                                      struct perf_comm_event *comm_event)
4001 {
4002         struct perf_output_handle handle;
4003         struct perf_sample_data sample;
4004         int size = comm_event->event_id.header.size;
4005         int ret;
4006
4007         perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4008         ret = perf_output_begin(&handle, event,
4009                                 comm_event->event_id.header.size, 0, 0);
4010
4011         if (ret)
4012                 goto out;
4013
4014         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
4015         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4016
4017         perf_output_put(&handle, comm_event->event_id);
4018         perf_output_copy(&handle, comm_event->comm,
4019                                    comm_event->comm_size);
4020
4021         perf_event__output_id_sample(event, &handle, &sample);
4022
4023         perf_output_end(&handle);
4024 out:
4025         comm_event->event_id.header.size = size;
4026 }
4027
4028 static int perf_event_comm_match(struct perf_event *event)
4029 {
4030         if (event->state < PERF_EVENT_STATE_INACTIVE)
4031                 return 0;
4032
4033         if (event->cpu != -1 && event->cpu != smp_processor_id())
4034                 return 0;
4035
4036         if (event->attr.comm)
4037                 return 1;
4038
4039         return 0;
4040 }
4041
4042 static void perf_event_comm_ctx(struct perf_event_context *ctx,
4043                                   struct perf_comm_event *comm_event)
4044 {
4045         struct perf_event *event;
4046
4047         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4048                 if (perf_event_comm_match(event))
4049                         perf_event_comm_output(event, comm_event);
4050         }
4051 }
4052
4053 static void perf_event_comm_event(struct perf_comm_event *comm_event)
4054 {
4055         struct perf_cpu_context *cpuctx;
4056         struct perf_event_context *ctx;
4057         char comm[TASK_COMM_LEN];
4058         unsigned int size;
4059         struct pmu *pmu;
4060         int ctxn;
4061
4062         memset(comm, 0, sizeof(comm));
4063         strlcpy(comm, comm_event->task->comm, sizeof(comm));
4064         size = ALIGN(strlen(comm)+1, sizeof(u64));
4065
4066         comm_event->comm = comm;
4067         comm_event->comm_size = size;
4068
4069         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
4070         rcu_read_lock();
4071         list_for_each_entry_rcu(pmu, &pmus, entry) {
4072                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4073                 if (cpuctx->active_pmu != pmu)
4074                         goto next;
4075                 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
4076
4077                 ctxn = pmu->task_ctx_nr;
4078                 if (ctxn < 0)
4079                         goto next;
4080
4081                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4082                 if (ctx)
4083                         perf_event_comm_ctx(ctx, comm_event);
4084 next:
4085                 put_cpu_ptr(pmu->pmu_cpu_context);
4086         }
4087         rcu_read_unlock();
4088 }
4089
4090 void perf_event_comm(struct task_struct *task)
4091 {
4092         struct perf_comm_event comm_event;
4093         struct perf_event_context *ctx;
4094         int ctxn;
4095
4096         for_each_task_context_nr(ctxn) {
4097                 ctx = task->perf_event_ctxp[ctxn];
4098                 if (!ctx)
4099                         continue;
4100
4101                 perf_event_enable_on_exec(ctx);
4102         }
4103
4104         if (!atomic_read(&nr_comm_events))
4105                 return;
4106
4107         comm_event = (struct perf_comm_event){
4108                 .task   = task,
4109                 /* .comm      */
4110                 /* .comm_size */
4111                 .event_id  = {
4112                         .header = {
4113                                 .type = PERF_RECORD_COMM,
4114                                 .misc = 0,
4115                                 /* .size */
4116                         },
4117                         /* .pid */
4118                         /* .tid */
4119                 },
4120         };
4121
4122         perf_event_comm_event(&comm_event);
4123 }
4124
4125 /*
4126  * mmap tracking
4127  */
4128
4129 struct perf_mmap_event {
4130         struct vm_area_struct   *vma;
4131
4132         const char              *file_name;
4133         int                     file_size;
4134
4135         struct {
4136                 struct perf_event_header        header;
4137
4138                 u32                             pid;
4139                 u32                             tid;
4140                 u64                             start;
4141                 u64                             len;
4142                 u64                             pgoff;
4143         } event_id;
4144 };
4145
4146 static void perf_event_mmap_output(struct perf_event *event,
4147                                      struct perf_mmap_event *mmap_event)
4148 {
4149         struct perf_output_handle handle;
4150         struct perf_sample_data sample;
4151         int size = mmap_event->event_id.header.size;
4152         int ret;
4153
4154         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4155         ret = perf_output_begin(&handle, event,
4156                                 mmap_event->event_id.header.size, 0, 0);
4157         if (ret)
4158                 goto out;
4159
4160         mmap_event->event_id.pid = perf_event_pid(event, current);
4161         mmap_event->event_id.tid = perf_event_tid(event, current);
4162
4163         perf_output_put(&handle, mmap_event->event_id);
4164         perf_output_copy(&handle, mmap_event->file_name,
4165                                    mmap_event->file_size);
4166
4167         perf_event__output_id_sample(event, &handle, &sample);
4168
4169         perf_output_end(&handle);
4170 out:
4171         mmap_event->event_id.header.size = size;
4172 }
4173
4174 static int perf_event_mmap_match(struct perf_event *event,
4175                                    struct perf_mmap_event *mmap_event,
4176                                    int executable)
4177 {
4178         if (event->state < PERF_EVENT_STATE_INACTIVE)
4179                 return 0;
4180
4181         if (event->cpu != -1 && event->cpu != smp_processor_id())
4182                 return 0;
4183
4184         if ((!executable && event->attr.mmap_data) ||
4185             (executable && event->attr.mmap))
4186                 return 1;
4187
4188         return 0;
4189 }
4190
4191 static void perf_event_mmap_ctx(struct perf_event_context *ctx,
4192                                   struct perf_mmap_event *mmap_event,
4193                                   int executable)
4194 {
4195         struct perf_event *event;
4196
4197         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4198                 if (perf_event_mmap_match(event, mmap_event, executable))
4199                         perf_event_mmap_output(event, mmap_event);
4200         }
4201 }
4202
4203 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4204 {
4205         struct perf_cpu_context *cpuctx;
4206         struct perf_event_context *ctx;
4207         struct vm_area_struct *vma = mmap_event->vma;
4208         struct file *file = vma->vm_file;
4209         unsigned int size;
4210         char tmp[16];
4211         char *buf = NULL;
4212         const char *name;
4213         struct pmu *pmu;
4214         int ctxn;
4215
4216         memset(tmp, 0, sizeof(tmp));
4217
4218         if (file) {
4219                 /*
4220                  * d_path works from the end of the buffer backwards, so we
4221                  * need to add enough zero bytes after the string to handle
4222                  * the 64bit alignment we do later.
4223                  */
4224                 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
4225                 if (!buf) {
4226                         name = strncpy(tmp, "//enomem", sizeof(tmp));
4227                         goto got_name;
4228                 }
4229                 name = d_path(&file->f_path, buf, PATH_MAX);
4230                 if (IS_ERR(name)) {
4231                         name = strncpy(tmp, "//toolong", sizeof(tmp));
4232                         goto got_name;
4233                 }
4234         } else {
4235                 if (arch_vma_name(mmap_event->vma)) {
4236                         name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4237                                        sizeof(tmp));
4238                         goto got_name;
4239                 }
4240
4241                 if (!vma->vm_mm) {
4242                         name = strncpy(tmp, "[vdso]", sizeof(tmp));
4243                         goto got_name;
4244                 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
4245                                 vma->vm_end >= vma->vm_mm->brk) {
4246                         name = strncpy(tmp, "[heap]", sizeof(tmp));
4247                         goto got_name;
4248                 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
4249                                 vma->vm_end >= vma->vm_mm->start_stack) {
4250                         name = strncpy(tmp, "[stack]", sizeof(tmp));
4251                         goto got_name;
4252                 }
4253
4254                 name = strncpy(tmp, "//anon", sizeof(tmp));
4255                 goto got_name;
4256         }
4257
4258 got_name:
4259         size = ALIGN(strlen(name)+1, sizeof(u64));
4260
4261         mmap_event->file_name = name;
4262         mmap_event->file_size = size;
4263
4264         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4265
4266         rcu_read_lock();
4267         list_for_each_entry_rcu(pmu, &pmus, entry) {
4268                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4269                 if (cpuctx->active_pmu != pmu)
4270                         goto next;
4271                 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4272                                         vma->vm_flags & VM_EXEC);
4273
4274                 ctxn = pmu->task_ctx_nr;
4275                 if (ctxn < 0)
4276                         goto next;
4277
4278                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4279                 if (ctx) {
4280                         perf_event_mmap_ctx(ctx, mmap_event,
4281                                         vma->vm_flags & VM_EXEC);
4282                 }
4283 next:
4284                 put_cpu_ptr(pmu->pmu_cpu_context);
4285         }
4286         rcu_read_unlock();
4287
4288         kfree(buf);
4289 }
4290
4291 void perf_event_mmap(struct vm_area_struct *vma)
4292 {
4293         struct perf_mmap_event mmap_event;
4294
4295         if (!atomic_read(&nr_mmap_events))
4296                 return;
4297
4298         mmap_event = (struct perf_mmap_event){
4299                 .vma    = vma,
4300                 /* .file_name */
4301                 /* .file_size */
4302                 .event_id  = {
4303                         .header = {
4304                                 .type = PERF_RECORD_MMAP,
4305                                 .misc = PERF_RECORD_MISC_USER,
4306                                 /* .size */
4307                         },
4308                         /* .pid */
4309                         /* .tid */
4310                         .start  = vma->vm_start,
4311                         .len    = vma->vm_end - vma->vm_start,
4312                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
4313                 },
4314         };
4315
4316         perf_event_mmap_event(&mmap_event);
4317 }
4318
4319 /*
4320  * IRQ throttle logging
4321  */
4322
4323 static void perf_log_throttle(struct perf_event *event, int enable)
4324 {
4325         struct perf_output_handle handle;
4326         struct perf_sample_data sample;
4327         int ret;
4328
4329         struct {
4330                 struct perf_event_header        header;
4331                 u64                             time;
4332                 u64                             id;
4333                 u64                             stream_id;
4334         } throttle_event = {
4335                 .header = {
4336                         .type = PERF_RECORD_THROTTLE,
4337                         .misc = 0,
4338                         .size = sizeof(throttle_event),
4339                 },
4340                 .time           = perf_clock(),
4341                 .id             = primary_event_id(event),
4342                 .stream_id      = event->id,
4343         };
4344
4345         if (enable)
4346                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4347
4348         perf_event_header__init_id(&throttle_event.header, &sample, event);
4349
4350         ret = perf_output_begin(&handle, event,
4351                                 throttle_event.header.size, 1, 0);
4352         if (ret)
4353                 return;
4354
4355         perf_output_put(&handle, throttle_event);
4356         perf_event__output_id_sample(event, &handle, &sample);
4357         perf_output_end(&handle);
4358 }
4359
4360 /*
4361  * Generic event overflow handling, sampling.
4362  */
4363
4364 static int __perf_event_overflow(struct perf_event *event, int nmi,
4365                                    int throttle, struct perf_sample_data *data,
4366                                    struct pt_regs *regs)
4367 {
4368         int events = atomic_read(&event->event_limit);
4369         struct hw_perf_event *hwc = &event->hw;
4370         int ret = 0;
4371
4372         /*
4373          * Non-sampling counters might still use the PMI to fold short
4374          * hardware counters, ignore those.
4375          */
4376         if (unlikely(!is_sampling_event(event)))
4377                 return 0;
4378
4379         if (!throttle) {
4380                 hwc->interrupts++;
4381         } else {
4382                 if (hwc->interrupts != MAX_INTERRUPTS) {
4383                         hwc->interrupts++;
4384                         if (HZ * hwc->interrupts >
4385                                         (u64)sysctl_perf_event_sample_rate) {
4386                                 hwc->interrupts = MAX_INTERRUPTS;
4387                                 perf_log_throttle(event, 0);
4388                                 ret = 1;
4389                         }
4390                 } else {
4391                         /*
4392                          * Keep re-disabling events even though on the previous
4393                          * pass we disabled it - just in case we raced with a
4394                          * sched-in and the event got enabled again:
4395                          */
4396                         ret = 1;
4397                 }
4398         }
4399
4400         if (event->attr.freq) {
4401                 u64 now = perf_clock();
4402                 s64 delta = now - hwc->freq_time_stamp;
4403
4404                 hwc->freq_time_stamp = now;
4405
4406                 if (delta > 0 && delta < 2*TICK_NSEC)
4407                         perf_adjust_period(event, delta, hwc->last_period);
4408         }
4409
4410         /*
4411          * XXX event_limit might not quite work as expected on inherited
4412          * events
4413          */
4414
4415         event->pending_kill = POLL_IN;
4416         if (events && atomic_dec_and_test(&event->event_limit)) {
4417                 ret = 1;
4418                 event->pending_kill = POLL_HUP;
4419                 if (nmi) {
4420                         event->pending_disable = 1;
4421                         irq_work_queue(&event->pending);
4422                 } else
4423                         perf_event_disable(event);
4424         }
4425
4426         if (event->overflow_handler)
4427                 event->overflow_handler(event, nmi, data, regs);
4428         else
4429                 perf_event_output(event, nmi, data, regs);
4430
4431         return ret;
4432 }
4433
/*
 * Public overflow entry point: same as __perf_event_overflow() with
 * throttling always enabled.  Returns its result unchanged.
 */
int perf_event_overflow(struct perf_event *event, int nmi,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	const int throttle = 1;

	return __perf_event_overflow(event, nmi, throttle, data, regs);
}
4440
4441 /*
4442  * Generic software event infrastructure
4443  */
4444
4445 struct swevent_htable {
4446         struct swevent_hlist            *swevent_hlist;
4447         struct mutex                    hlist_mutex;
4448         int                             hlist_refcount;
4449
4450         /* Recursion avoidance in each contexts */
4451         int                             recursion[PERF_NR_CONTEXTS];
4452 };
4453
4454 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4455
/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. That second value
 * is kept in the range [-sample_period, 0] so that we can use its
 * sign as the trigger.
 */
4462
4463 static u64 perf_swevent_set_period(struct perf_event *event)
4464 {
4465         struct hw_perf_event *hwc = &event->hw;
4466         u64 period = hwc->last_period;
4467         u64 nr, offset;
4468         s64 old, val;
4469
4470         hwc->last_period = hwc->sample_period;
4471
4472 again:
4473         old = val = local64_read(&hwc->period_left);
4474         if (val < 0)
4475                 return 0;
4476
4477         nr = div64_u64(period + val, period);
4478         offset = nr * period;
4479         val -= offset;
4480         if (local64_cmpxchg(&hwc->period_left, old, val) != old)
4481                 goto again;
4482
4483         return nr;
4484 }
4485
4486 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4487                                     int nmi, struct perf_sample_data *data,
4488                                     struct pt_regs *regs)
4489 {
4490         struct hw_perf_event *hwc = &event->hw;
4491         int throttle = 0;
4492
4493         data->period = event->hw.last_period;
4494         if (!overflow)
4495                 overflow = perf_swevent_set_period(event);
4496
4497         if (hwc->interrupts == MAX_INTERRUPTS)
4498                 return;
4499
4500         for (; overflow; overflow--) {
4501                 if (__perf_event_overflow(event, nmi, throttle,
4502                                             data, regs)) {
4503                         /*
4504                          * We inhibit the overflow from happening when
4505                          * hwc->interrupts == MAX_INTERRUPTS.
4506                          */
4507                         break;
4508                 }
4509                 throttle = 1;
4510         }
4511 }
4512
4513 static void perf_swevent_event(struct perf_event *event, u64 nr,
4514                                int nmi, struct perf_sample_data *data,
4515                                struct pt_regs *regs)
4516 {
4517         struct hw_perf_event *hwc = &event->hw;
4518
4519         local64_add(nr, &event->count);
4520
4521         if (!regs)
4522                 return;
4523
4524         if (!is_sampling_event(event))
4525                 return;
4526
4527         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4528                 return perf_swevent_overflow(event, 1, nmi, data, regs);
4529
4530         if (local64_add_negative(nr, &hwc->period_left))
4531                 return;
4532
4533         perf_swevent_overflow(event, 0, nmi, data, regs);
4534 }
4535
4536 static int perf_exclude_event(struct perf_event *event,
4537                               struct pt_regs *regs)
4538 {
4539         if (event->hw.state & PERF_HES_STOPPED)
4540                 return 0;
4541
4542         if (regs) {
4543                 if (event->attr.exclude_user && user_mode(regs))
4544                         return 1;
4545
4546                 if (event->attr.exclude_kernel && !user_mode(regs))
4547                         return 1;
4548         }
4549
4550         return 0;
4551 }
4552
4553 static int perf_swevent_match(struct perf_event *event,
4554                                 enum perf_type_id type,
4555                                 u32 event_id,
4556                                 struct perf_sample_data *data,
4557                                 struct pt_regs *regs)
4558 {
4559         if (event->attr.type != type)
4560                 return 0;
4561
4562         if (event->attr.config != event_id)
4563                 return 0;
4564
4565         if (perf_exclude_event(event, regs))
4566                 return 0;
4567
4568         return 1;
4569 }
4570
4571 static inline u64 swevent_hash(u64 type, u32 event_id)
4572 {
4573         u64 val = event_id | (type << 32);
4574
4575         return hash_64(val, SWEVENT_HLIST_BITS);
4576 }
4577
4578 static inline struct hlist_head *
4579 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4580 {
4581         u64 hash = swevent_hash(type, event_id);
4582
4583         return &hlist->heads[hash];
4584 }
4585
4586 /* For the read side: events when they trigger */
4587 static inline struct hlist_head *
4588 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4589 {
4590         struct swevent_hlist *hlist;
4591
4592         hlist = rcu_dereference(swhash->swevent_hlist);
4593         if (!hlist)
4594                 return NULL;
4595
4596         return __find_swevent_head(hlist, type, event_id);
4597 }
4598
4599 /* For the event head insertion and removal in the hlist */
4600 static inline struct hlist_head *
4601 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4602 {
4603         struct swevent_hlist *hlist;
4604         u32 event_id = event->attr.config;
4605         u64 type = event->attr.type;
4606
4607         /*
4608          * Event scheduling is always serialized against hlist allocation
4609          * and release. Which makes the protected version suitable here.
4610          * The context lock guarantees that.
4611          */
4612         hlist = rcu_dereference_protected(swhash->swevent_hlist,
4613                                           lockdep_is_held(&event->ctx->lock));
4614         if (!hlist)
4615                 return NULL;
4616
4617         return __find_swevent_head(hlist, type, event_id);
4618 }
4619
4620 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4621                                     u64 nr, int nmi,
4622                                     struct perf_sample_data *data,
4623                                     struct pt_regs *regs)
4624 {
4625         struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4626         struct perf_event *event;
4627         struct hlist_node *node;
4628         struct hlist_head *head;
4629
4630         rcu_read_lock();
4631         head = find_swevent_head_rcu(swhash, type, event_id);
4632         if (!head)
4633                 goto end;
4634
4635         hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4636                 if (perf_swevent_match(event, type, event_id, data, regs))
4637                         perf_swevent_event(event, nr, nmi, data, regs);
4638         }
4639 end:
4640         rcu_read_unlock();
4641 }
4642
4643 int perf_swevent_get_recursion_context(void)
4644 {
4645         struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4646
4647         return get_recursion_context(swhash->recursion);
4648 }
4649 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4650
4651 void inline perf_swevent_put_recursion_context(int rctx)
4652 {
4653         struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4654
4655         put_recursion_context(swhash->recursion, rctx);
4656 }
4657
4658 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4659                             struct pt_regs *regs, u64 addr)
4660 {
4661         struct perf_sample_data data;
4662         int rctx;
4663
4664         preempt_disable_notrace();
4665         rctx = perf_swevent_get_recursion_context();
4666         if (rctx < 0)
4667                 return;
4668
4669         perf_sample_data_init(&data, addr);
4670
4671         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4672
4673         perf_swevent_put_recursion_context(rctx);
4674         preempt_enable_notrace();
4675 }
4676
/* pmu::read callback for software events: intentionally does nothing. */
static void perf_swevent_read(struct perf_event *event)
{
	/* nothing to do */
}
4680
4681 static int perf_swevent_add(struct perf_event *event, int flags)
4682 {
4683         struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4684         struct hw_perf_event *hwc = &event->hw;
4685         struct hlist_head *head;
4686
4687         if (is_sampling_event(event)) {
4688                 hwc->last_period = hwc->sample_period;
4689                 perf_swevent_set_period(event);
4690         }
4691
4692         hwc->state = !(flags & PERF_EF_START);
4693
4694         head = find_swevent_head(swhash, event);
4695         if (WARN_ON_ONCE(!head))
4696                 return -EINVAL;
4697
4698         hlist_add_head_rcu(&event->hlist_entry, head);
4699
4700         return 0;
4701 }
4702
4703 static void perf_swevent_del(struct perf_event *event, int flags)
4704 {
4705         hlist_del_rcu(&event->hlist_entry);
4706 }
4707
4708 static void perf_swevent_start(struct perf_event *event, int flags)
4709 {
4710         event->hw.state = 0;
4711 }
4712
4713 static void perf_swevent_stop(struct perf_event *event, int flags)
4714 {
4715         event->hw.state = PERF_HES_STOPPED;
4716 }
4717
4718 /* Deref the hlist from the update side */
4719 static inline struct swevent_hlist *
4720 swevent_hlist_deref(struct swevent_htable *swhash)
4721 {
4722         return rcu_dereference_protected(swhash->swevent_hlist,
4723                                          lockdep_is_held(&swhash->hlist_mutex));
4724 }
4725
4726 static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4727 {
4728         struct swevent_hlist *hlist;
4729
4730         hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4731         kfree(hlist);
4732 }
4733
4734 static void swevent_hlist_release(struct swevent_htable *swhash)
4735 {
4736         struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
4737
4738         if (!hlist)
4739                 return;
4740
4741         rcu_assign_pointer(swhash->swevent_hlist, NULL);
4742         call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4743 }
4744
4745 static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4746 {
4747         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4748
4749         mutex_lock(&swhash->hlist_mutex);
4750
4751         if (!--swhash->hlist_refcount)
4752                 swevent_hlist_release(swhash);
4753
4754         mutex_unlock(&swhash->hlist_mutex);
4755 }
4756
4757 static void swevent_hlist_put(struct perf_event *event)
4758 {
4759         int cpu;
4760
4761         if (event->cpu != -1) {
4762                 swevent_hlist_put_cpu(event, event->cpu);
4763                 return;
4764         }
4765
4766         for_each_possible_cpu(cpu)
4767                 swevent_hlist_put_cpu(event, cpu);
4768 }
4769
4770 static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4771 {
4772         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4773         int err = 0;
4774
4775         mutex_lock(&swhash->hlist_mutex);
4776
4777         if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
4778                 struct swevent_hlist *hlist;
4779
4780                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
4781                 if (!hlist) {
4782                         err = -ENOMEM;
4783                         goto exit;
4784                 }
4785                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
4786         }
4787         swhash->hlist_refcount++;
4788 exit:
4789         mutex_unlock(&swhash->hlist_mutex);
4790
4791         return err;
4792 }
4793
4794 static int swevent_hlist_get(struct perf_event *event)
4795 {
4796         int err;
4797         int cpu, failed_cpu;
4798
4799         if (event->cpu != -1)
4800                 return swevent_hlist_get_cpu(event, event->cpu);
4801
4802         get_online_cpus();
4803         for_each_possible_cpu(cpu) {
4804                 err = swevent_hlist_get_cpu(event, cpu);
4805                 if (err) {
4806                         failed_cpu = cpu;
4807                         goto fail;
4808                 }
4809         }
4810         put_online_cpus();
4811
4812         return 0;
4813 fail:
4814         for_each_possible_cpu(cpu) {
4815                 if (cpu == failed_cpu)
4816                         break;
4817                 swevent_hlist_put_cpu(event, cpu);
4818         }
4819
4820         put_online_cpus();
4821         return err;
4822 }
4823
4824 atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4825
4826 static void sw_perf_event_destroy(struct perf_event *event)
4827 {
4828         u64 event_id = event->attr.config;
4829
4830         WARN_ON(event->parent);
4831
4832         jump_label_dec(&perf_swevent_enabled[event_id]);
4833         swevent_hlist_put(event);
4834 }
4835
4836 static int perf_swevent_init(struct perf_event *event)
4837 {
4838         int event_id = event->attr.config;
4839
4840         if (event->attr.type != PERF_TYPE_SOFTWARE)
4841                 return -ENOENT;
4842
4843         switch (event_id) {
4844         case PERF_COUNT_SW_CPU_CLOCK:
4845         case PERF_COUNT_SW_TASK_CLOCK:
4846                 return -ENOENT;
4847
4848         default:
4849                 break;
4850         }
4851
4852         if (event_id >= PERF_COUNT_SW_MAX)
4853                 return -ENOENT;
4854
4855         if (!event->parent) {
4856                 int err;
4857
4858                 err = swevent_hlist_get(event);
4859                 if (err)
4860                         return err;
4861
4862                 jump_label_inc(&perf_swevent_enabled[event_id]);
4863                 event->destroy = sw_perf_event_destroy;
4864         }
4865
4866         return 0;
4867 }
4868
4869 static struct pmu perf_swevent = {
4870         .task_ctx_nr    = perf_sw_context,
4871
4872         .event_init     = perf_swevent_init,
4873         .add            = perf_swevent_add,
4874         .del            = perf_swevent_del,
4875         .start          = perf_swevent_start,
4876         .stop           = perf_swevent_stop,
4877         .read           = perf_swevent_read,
4878 };
4879
4880 #ifdef CONFIG_EVENT_TRACING
4881
4882 static int perf_tp_filter_match(struct perf_event *event,
4883                                 struct perf_sample_data *data)
4884 {
4885         void *record = data->raw->data;
4886
4887         if (likely(!event->filter) || filter_match_preds(event->filter, record))
4888                 return 1;
4889         return 0;
4890 }
4891
4892 static int perf_tp_event_match(struct perf_event *event,
4893                                 struct perf_sample_data *data,
4894                                 struct pt_regs *regs)
4895 {
4896         /*
4897          * All tracepoints are from kernel-space.
4898          */
4899         if (event->attr.exclude_kernel)
4900                 return 0;
4901
4902         if (!perf_tp_filter_match(event, data))
4903                 return 0;
4904
4905         return 1;
4906 }
4907
4908 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4909                    struct pt_regs *regs, struct hlist_head *head, int rctx)
4910 {
4911         struct perf_sample_data data;
4912         struct perf_event *event;
4913         struct hlist_node *node;
4914
4915         struct perf_raw_record raw = {
4916                 .size = entry_size,
4917                 .data = record,
4918         };
4919
4920         perf_sample_data_init(&data, addr);
4921         data.raw = &raw;
4922
4923         hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4924                 if (perf_tp_event_match(event, &data, regs))
4925                         perf_swevent_event(event, count, 1, &data, regs);
4926         }
4927
4928         perf_swevent_put_recursion_context(rctx);
4929 }
4930 EXPORT_SYMBOL_GPL(perf_tp_event);
4931
/* event->destroy callback for tracepoint events. */
static void tp_perf_event_destroy(struct perf_event *event)
{
	/* Counterpart of the perf_trace_init() done at event init time. */
	perf_trace_destroy(event);
}
4936
4937 static int perf_tp_event_init(struct perf_event *event)
4938 {
4939         int err;
4940
4941         if (event->attr.type != PERF_TYPE_TRACEPOINT)
4942                 return -ENOENT;
4943
4944         err = perf_trace_init(event);
4945         if (err)
4946                 return err;
4947
4948         event->destroy = tp_perf_event_destroy;
4949
4950         return 0;
4951 }
4952
4953 static struct pmu perf_tracepoint = {
4954         .task_ctx_nr    = perf_sw_context,
4955
4956         .event_init     = perf_tp_event_init,
4957         .add            = perf_trace_add,
4958         .del            = perf_trace_del,
4959         .start          = perf_swevent_start,
4960         .stop           = perf_swevent_stop,
4961         .read           = perf_swevent_read,
4962 };
4963
4964 static inline void perf_tp_register(void)
4965 {
4966         perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
4967 }
4968
4969 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4970 {
4971         char *filter_str;
4972         int ret;
4973
4974         if (event->attr.type != PERF_TYPE_TRACEPOINT)
4975                 return -EINVAL;
4976
4977         filter_str = strndup_user(arg, PAGE_SIZE);
4978         if (IS_ERR(filter_str))
4979                 return PTR_ERR(filter_str);
4980
4981         ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4982
4983         kfree(filter_str);
4984         return ret;
4985 }
4986
/* Drop the filter of @event via ftrace_profile_free_filter(). */
static void perf_event_free_filter(struct perf_event *event)
{
	ftrace_profile_free_filter(event);
}
4991
4992 #else
4993
/* !CONFIG_EVENT_TRACING: there is no tracepoint PMU to register. */
static inline void perf_tp_register(void)
{
	/* nothing to do */
}
4997
4998 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4999 {
5000         return -ENOENT;
5001 }
5002
/* !CONFIG_EVENT_TRACING: there is never a filter to free. */
static void perf_event_free_filter(struct perf_event *event)
{
	/* nothing to do */
}
5006
5007 #endif /* CONFIG_EVENT_TRACING */
5008
5009 #ifdef CONFIG_HAVE_HW_BREAKPOINT
5010 void perf_bp_event(struct perf_event *bp, void *data)
5011 {
5012         struct perf_sample_data sample;
5013         struct pt_regs *regs = data;
5014
5015         perf_sample_data_init(&sample, bp->attr.bp_addr);
5016
5017         if (!bp->hw.state && !perf_exclude_event(bp, regs))
5018                 perf_swevent_event(bp, 1, 1, &sample, regs);
5019 }
5020 #endif
5021
5022 /*
5023  * hrtimer based swevent callback
5024  */
5025
5026 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5027 {
5028         enum hrtimer_restart ret = HRTIMER_RESTART;
5029         struct perf_sample_data data;
5030         struct pt_regs *regs;
5031         struct perf_event *event;
5032         u64 period;
5033
5034         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5035         event->pmu->read(event);
5036
5037         perf_sample_data_init(&data, 0);
5038         data.period = event->hw.last_period;
5039         regs = get_irq_regs();
5040
5041         if (regs && !perf_exclude_event(event, regs)) {
5042                 if (!(event->attr.exclude_idle && current->pid == 0))
5043                         if (perf_event_overflow(event, 0, &data, regs))
5044                                 ret = HRTIMER_NORESTART;
5045         }
5046
5047         period = max_t(u64, 10000, event->hw.sample_period);
5048         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
5049
5050         return ret;
5051 }
5052
5053 static void perf_swevent_start_hrtimer(struct perf_event *event)
5054 {
5055         struct hw_perf_event *hwc = &event->hw;
5056         s64 period;
5057
5058         if (!is_sampling_event(event))
5059                 return;
5060
5061         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5062         hwc->hrtimer.function = perf_swevent_hrtimer;
5063
5064         period = local64_read(&hwc->period_left);
5065         if (period) {
5066                 if (period < 0)
5067                         period = 10000;
5068
5069                 local64_set(&hwc->period_left, 0);
5070         } else {
5071                 period = max_t(u64, 10000, hwc->sample_period);
5072         }
5073         __hrtimer_start_range_ns(&hwc->hrtimer,
5074                                 ns_to_ktime(period), 0,
5075                                 HRTIMER_MODE_REL_PINNED, 0);
5076 }
5077
5078 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5079 {
5080         struct hw_perf_event *hwc = &event->hw;
5081
5082         if (is_sampling_event(event)) {
5083                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
5084                 local64_set(&hwc->period_left, ktime_to_ns(remaining));
5085
5086                 hrtimer_cancel(&hwc->hrtimer);
5087         }
5088 }
5089
5090 /*
5091  * Software event: cpu wall time clock
5092  */
5093
5094 static void cpu_clock_event_update(struct perf_event *event)
5095 {
5096         s64 prev;
5097         u64 now;
5098
5099         now = local_clock();
5100         prev = local64_xchg(&event->hw.prev_count, now);
5101         local64_add(now - prev, &event->count);
5102 }
5103
5104 static void cpu_clock_event_start(struct perf_event *event, int flags)
5105 {
5106         local64_set(&event->hw.prev_count, local_clock());
5107         perf_swevent_start_hrtimer(event);
5108 }
5109
/* pmu::stop: cancel the sampling timer, then fold in the elapsed time. */
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
	/* Timer first, so the final update is not followed by a sample. */
	perf_swevent_cancel_hrtimer(event);
	cpu_clock_event_update(event);
}
5115
5116 static int cpu_clock_event_add(struct perf_event *event, int flags)
5117 {
5118         if (flags & PERF_EF_START)
5119                 cpu_clock_event_start(event, flags);
5120
5121         return 0;
5122 }
5123
/* pmu::del: removing the event is the same as stopping it. */
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
	/* @flags is forwarded unchanged. */
	cpu_clock_event_stop(event, flags);
}
5128
/* pmu::read: bring the event count up to date. */
static void cpu_clock_event_read(struct perf_event *event)
{
	/* Reading is just an update against local_clock(). */
	cpu_clock_event_update(event);
}
5133
5134 static int cpu_clock_event_init(struct perf_event *event)
5135 {
5136         if (event->attr.type != PERF_TYPE_SOFTWARE)
5137                 return -ENOENT;
5138
5139         if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5140                 return -ENOENT;
5141
5142         return 0;
5143 }
5144
5145 static struct pmu perf_cpu_clock = {
5146         .task_ctx_nr    = perf_sw_context,
5147
5148         .event_init     = cpu_clock_event_init,
5149         .add            = cpu_clock_event_add,
5150         .del            = cpu_clock_event_del,
5151         .start          = cpu_clock_event_start,
5152         .stop           = cpu_clock_event_stop,
5153         .read           = cpu_clock_event_read,
5154 };
5155
5156 /*
5157  * Software event: task time clock
5158  */
5159
5160 static void task_clock_event_update(struct perf_event *event, u64 now)
5161 {
5162         u64 prev;
5163         s64 delta;
5164
5165         prev = local64_xchg(&event->hw.prev_count, now);
5166         delta = now - prev;
5167         local64_add(delta, &event->count);
5168 }
5169
5170 static void task_clock_event_start(struct perf_event *event, int flags)
5171 {
5172         local64_set(&event->hw.prev_count, event->ctx->time);
5173         perf_swevent_start_hrtimer(event);
5174 }
5175
5176 static void task_clock_event_stop(struct perf_event *event, int flags)
5177 {
5178         perf_swevent_cancel_hrtimer(event);
5179         task_clock_event_update(event, event->ctx->time);
5180 }
5181
5182 static int task_clock_event_add(struct perf_event *event, int flags)
5183 {
5184         if (flags & PERF_EF_START)
5185                 task_clock_event_start(event, flags);
5186
5187         return 0;
5188 }
5189
5190 static void task_clock_event_del(struct perf_event *event, int flags)
5191 {
5192         task_clock_event_stop(event, PERF_EF_UPDATE);
5193 }
5194
5195 static void task_clock_event_read(struct perf_event *event)
5196 {
5197         u64 time;
5198
5199         if (!in_nmi()) {
5200                 update_context_time(event->ctx);
5201                 time = event->ctx->time;
5202         } else {
5203                 u64 now = perf_clock();
5204                 u64 delta = now - event->ctx->timestamp;
5205                 time = event->ctx->time + delta;
5206         }
5207
5208         task_clock_event_update(event, time);
5209 }
5210
5211 static int task_clock_event_init(struct perf_event *event)
5212 {
5213         if (event->attr.type != PERF_TYPE_SOFTWARE)
5214                 return -ENOENT;
5215
5216         if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5217                 return -ENOENT;
5218
5219         return 0;
5220 }
5221
5222 static struct pmu perf_task_clock = {
5223         .task_ctx_nr    = perf_sw_context,
5224
5225         .event_init     = task_clock_event_init,
5226         .add            = task_clock_event_add,
5227         .del            = task_clock_event_del,
5228         .start          = task_clock_event_start,
5229         .stop           = task_clock_event_stop,
5230         .read           = task_clock_event_read,
5231 };
5232
/* Do-nothing stub for optional void PMU callbacks. */
static void perf_pmu_nop_void(struct pmu *pmu)
{
	/* intentionally empty */
}
5236
/* Do-nothing stub for optional int PMU callbacks; always reports success. */
static int perf_pmu_nop_int(struct pmu *pmu)
{
	int ret = 0;

	return ret;
}
5241
/* Default start_txn: disable the PMU for the duration of the transaction. */
static void perf_pmu_start_txn(struct pmu *pmu)
{
	/* Re-enabled by the default commit_txn/cancel_txn. */
	perf_pmu_disable(pmu);
}
5246
/* Default commit_txn: re-enable the PMU; always returns 0. */
static int perf_pmu_commit_txn(struct pmu *pmu)
{
	int ret = 0;

	perf_pmu_enable(pmu);

	return ret;
}
5252
/* Default cancel_txn: re-enable the PMU. */
static void perf_pmu_cancel_txn(struct pmu *pmu)
{
	/* Counterpart of the disable done by the default start_txn. */
	perf_pmu_enable(pmu);
}
5257
5258 /*
5259  * Ensures all contexts with the same task_ctx_nr have the same
5260  * pmu_cpu_context too.
5261  */
5262 static void *find_pmu_context(int ctxn)
5263 {
5264         struct pmu *pmu;
5265
5266         if (ctxn < 0)
5267                 return NULL;
5268
5269         list_for_each_entry(pmu, &pmus, entry) {
5270                 if (pmu->task_ctx_nr == ctxn)
5271                         return pmu->pmu_cpu_context;
5272         }
5273
5274         return NULL;
5275 }
5276
5277 static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
5278 {
5279         int cpu;
5280
5281         for_each_possible_cpu(cpu) {
5282                 struct perf_cpu_context *cpuctx;
5283
5284                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5285
5286                 if (cpuctx->active_pmu == old_pmu)
5287                         cpuctx->active_pmu = pmu;
5288         }
5289 }
5290
5291 static void free_pmu_context(struct pmu *pmu)
5292 {
5293         struct pmu *i;
5294
5295         mutex_lock(&pmus_lock);
5296         /*
5297          * Like a real lame refcount.
5298          */
5299         list_for_each_entry(i, &pmus, entry) {
5300                 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5301                         update_pmu_context(i, pmu);
5302                         goto out;
5303                 }
5304         }
5305
5306         free_percpu(pmu->pmu_cpu_context);
5307 out:
5308         mutex_unlock(&pmus_lock);
5309 }
5310 static struct idr pmu_idr;
5311
5312 static ssize_t
5313 type_show(struct device *dev, struct device_attribute *attr, char *page)
5314 {
5315         struct pmu *pmu = dev_get_drvdata(dev);
5316
5317         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5318 }
5319
5320 static struct device_attribute pmu_dev_attrs[] = {
5321        __ATTR_RO(type),
5322        __ATTR_NULL,
5323 };
5324
5325 static int pmu_bus_running;
5326 static struct bus_type pmu_bus = {
5327         .name           = "event_source",
5328         .dev_attrs      = pmu_dev_attrs,
5329 };
5330
/* Device release callback: free the device allocated by pmu_dev_alloc(). */
static void pmu_dev_release(struct device *dev)
{
	/* The struct device itself was kzalloc()ed. */
	kfree(dev);
}
5335
5336 static int pmu_dev_alloc(struct pmu *pmu)
5337 {
5338         int ret = -ENOMEM;
5339
5340         pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5341         if (!pmu->dev)
5342                 goto out;
5343
5344         device_initialize(pmu->dev);
5345         ret = dev_set_name(pmu->dev, "%s", pmu->name);
5346         if (ret)
5347                 goto free_dev;
5348
5349         dev_set_drvdata(pmu->dev, pmu);
5350         pmu->dev->bus = &pmu_bus;
5351         pmu->dev->release = pmu_dev_release;
5352         ret = device_add(pmu->dev);
5353         if (ret)
5354                 goto free_dev;
5355
5356 out:
5357         return ret;
5358
5359 free_dev:
5360         put_device(pmu->dev);
5361         goto out;
5362 }
5363
5364 int perf_pmu_register(struct pmu *pmu, char *name, int type)
5365 {
5366         int cpu, ret;
5367
5368         mutex_lock(&pmus_lock);
5369         ret = -ENOMEM;
5370         pmu->pmu_disable_count = alloc_percpu(int);
5371         if (!pmu->pmu_disable_count)
5372                 goto unlock;
5373
5374         pmu->type = -1;
5375         if (!name)
5376                 goto skip_type;
5377         pmu->name = name;
5378
5379         if (type < 0) {
5380                 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5381                 if (!err)
5382                         goto free_pdc;
5383
5384                 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5385                 if (err) {
5386                         ret = err;
5387                         goto free_pdc;
5388                 }
5389         }
5390         pmu->type = type;
5391
5392         if (pmu_bus_running) {
5393                 ret = pmu_dev_alloc(pmu);
5394                 if (ret)
5395                         goto free_idr;
5396         }
5397
5398 skip_type:
5399         pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5400         if (pmu->pmu_cpu_context)
5401                 goto got_cpu_context;
5402
5403         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5404         if (!pmu->pmu_cpu_context)
5405                 goto free_dev;
5406
5407         for_each_possible_cpu(cpu) {
5408                 struct perf_cpu_context *cpuctx;
5409
5410                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5411                 __perf_event_init_context(&cpuctx->ctx);
5412                 cpuctx->ctx.type = cpu_context;
5413                 cpuctx->ctx.pmu = pmu;
5414                 cpuctx->jiffies_interval = 1;
5415                 INIT_LIST_HEAD(&cpuctx->rotation_list);
5416                 cpuctx->active_pmu = pmu;
5417         }
5418
5419 got_cpu_context:
5420         if (!pmu->start_txn) {
5421                 if (pmu->pmu_enable) {
5422                         /*
5423                          * If we have pmu_enable/pmu_disable calls, install
5424                          * transaction stubs that use that to try and batch
5425                          * hardware accesses.
5426                          */
5427                         pmu->start_txn  = perf_pmu_start_txn;
5428                         pmu->commit_txn = perf_pmu_commit_txn;
5429                         pmu->cancel_txn = perf_pmu_cancel_txn;
5430                 } else {
5431                         pmu->start_txn  = perf_pmu_nop_void;
5432                         pmu->commit_txn = perf_pmu_nop_int;
5433                         pmu->cancel_txn = perf_pmu_nop_void;
5434                 }
5435         }
5436
5437         if (!pmu->pmu_enable) {
5438                 pmu->pmu_enable  = perf_pmu_nop_void;
5439                 pmu->pmu_disable = perf_pmu_nop_void;
5440         }
5441
5442         list_add_rcu(&pmu->entry, &pmus);
5443         ret = 0;
5444 unlock:
5445         mutex_unlock(&pmus_lock);
5446
5447         return ret;
5448
5449 free_dev:
5450         device_del(pmu->dev);
5451         put_device(pmu->dev);
5452
5453 free_idr:
5454         if (pmu->type >= PERF_TYPE_MAX)
5455                 idr_remove(&pmu_idr, pmu->type);
5456
5457 free_pdc:
5458         free_percpu(pmu->pmu_disable_count);
5459         goto unlock;
5460 }
5461
5462 void perf_pmu_unregister(struct pmu *pmu)
5463 {
5464         mutex_lock(&pmus_lock);
5465         list_del_rcu(&pmu->entry);
5466         mutex_unlock(&pmus_lock);
5467
5468         /*
5469          * We dereference the pmu list under both SRCU and regular RCU, so
5470          * synchronize against both of those.
5471          */
5472         synchronize_srcu(&pmus_srcu);
5473         synchronize_rcu();
5474
5475         free_percpu(pmu->pmu_disable_count);
5476         if (pmu->type >= PERF_TYPE_MAX)
5477                 idr_remove(&pmu_idr, pmu->type);
5478         device_del(pmu->dev);
5479         put_device(pmu->dev);
5480         free_pmu_context(pmu);
5481 }
5482
5483 struct pmu *perf_init_event(struct perf_event *event)
5484 {
5485         struct pmu *pmu = NULL;
5486         int idx;
5487
5488         idx = srcu_read_lock(&pmus_srcu);
5489
5490         rcu_read_lock();
5491         pmu = idr_find(&pmu_idr, event->attr.type);
5492         rcu_read_unlock();
5493         if (pmu)
5494                 goto unlock;
5495
5496         list_for_each_entry_rcu(pmu, &pmus, entry) {
5497                 int ret = pmu->event_init(event);
5498                 if (!ret)
5499                         goto unlock;
5500
5501                 if (ret != -ENOENT) {
5502                         pmu = ERR_PTR(ret);
5503                         goto unlock;
5504                 }
5505         }
5506         pmu = ERR_PTR(-ENOENT);
5507 unlock:
5508         srcu_read_unlock(&pmus_srcu, idx);
5509
5510         return pmu;
5511 }
5512
5513 /*
5514  * Allocate and initialize a event structure
5515  */
5516 static struct perf_event *
5517 perf_event_alloc(struct perf_event_attr *attr, int cpu,
5518                  struct task_struct *task,
5519                  struct perf_event *group_leader,
5520                  struct perf_event *parent_event,
5521                  perf_overflow_handler_t overflow_handler)
5522 {
5523         struct pmu *pmu;
5524         struct perf_event *event;
5525         struct hw_perf_event *hwc;
5526         long err;
5527
5528         event = kzalloc(sizeof(*event), GFP_KERNEL);
5529         if (!event)
5530                 return ERR_PTR(-ENOMEM);
5531
5532         /*
5533          * Single events are their own group leaders, with an
5534          * empty sibling list:
5535          */
5536         if (!group_leader)
5537                 group_leader = event;
5538
5539         mutex_init(&event->child_mutex);
5540         INIT_LIST_HEAD(&event->child_list);
5541
5542         INIT_LIST_HEAD(&event->group_entry);
5543         INIT_LIST_HEAD(&event->event_entry);
5544         INIT_LIST_HEAD(&event->sibling_list);
5545         init_waitqueue_head(&event->waitq);
5546         init_irq_work(&event->pending, perf_pending_event);
5547
5548         mutex_init(&event->mmap_mutex);
5549
5550         event->cpu              = cpu;
5551         event->attr             = *attr;
5552         event->group_leader     = group_leader;
5553         event->pmu              = NULL;
5554         event->oncpu            = -1;
5555
5556         event->parent           = parent_event;
5557
5558         event->ns               = get_pid_ns(current->nsproxy->pid_ns);
5559         event->id               = atomic64_inc_return(&perf_event_id);
5560
5561         event->state            = PERF_EVENT_STATE_INACTIVE;
5562
5563         if (task) {
5564                 event->attach_state = PERF_ATTACH_TASK;
5565 #ifdef CONFIG_HAVE_HW_BREAKPOINT
5566                 /*
5567                  * hw_breakpoint is a bit difficult here..
5568                  */
5569                 if (attr->type == PERF_TYPE_BREAKPOINT)
5570                         event->hw.bp_target = task;
5571 #endif
5572         }
5573
5574         if (!overflow_handler && parent_event)
5575                 overflow_handler = parent_event->overflow_handler;
5576         
5577         event->overflow_handler = overflow_handler;
5578
5579         if (attr->disabled)
5580                 event->state = PERF_EVENT_STATE_OFF;
5581
5582         pmu = NULL;
5583
5584         hwc = &event->hw;
5585         hwc->sample_period = attr->sample_period;
5586         if (attr->freq && attr->sample_freq)
5587                 hwc->sample_period = 1;
5588         hwc->last_period = hwc->sample_period;
5589
5590         local64_set(&hwc->period_left, hwc->sample_period);
5591
5592         /*
5593          * we currently do not support PERF_FORMAT_GROUP on inherited events
5594          */
5595         if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
5596                 goto done;
5597
5598         pmu = perf_init_event(event);
5599
5600 done:
5601         err = 0;
5602         if (!pmu)
5603                 err = -EINVAL;
5604         else if (IS_ERR(pmu))
5605                 err = PTR_ERR(pmu);
5606
5607         if (err) {
5608                 if (event->ns)
5609                         put_pid_ns(event->ns);
5610                 kfree(event);
5611                 return ERR_PTR(err);
5612         }
5613
5614         event->pmu = pmu;
5615
5616         if (!event->parent) {
5617                 if (event->attach_state & PERF_ATTACH_TASK)
5618                         jump_label_inc(&perf_task_events);
5619                 if (event->attr.mmap || event->attr.mmap_data)
5620                         atomic_inc(&nr_mmap_events);
5621                 if (event->attr.comm)
5622                         atomic_inc(&nr_comm_events);
5623                 if (event->attr.task)
5624                         atomic_inc(&nr_task_events);
5625                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
5626                         err = get_callchain_buffers();
5627                         if (err) {
5628                                 free_event(event);
5629                                 return ERR_PTR(err);
5630                         }
5631                 }
5632         }
5633
5634         return event;
5635 }
5636
5637 static int perf_copy_attr(struct perf_event_attr __user *uattr,
5638                           struct perf_event_attr *attr)
5639 {
5640         u32 size;
5641         int ret;
5642
5643         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
5644                 return -EFAULT;
5645
5646         /*
5647          * zero the full structure, so that a short copy will be nice.
5648          */
5649         memset(attr, 0, sizeof(*attr));
5650
5651         ret = get_user(size, &uattr->size);
5652         if (ret)
5653                 return ret;
5654
5655         if (size > PAGE_SIZE)   /* silly large */
5656                 goto err_size;
5657
5658         if (!size)              /* abi compat */
5659                 size = PERF_ATTR_SIZE_VER0;
5660
5661         if (size < PERF_ATTR_SIZE_VER0)
5662                 goto err_size;
5663
5664         /*
5665          * If we're handed a bigger struct than we know of,
5666          * ensure all the unknown bits are 0 - i.e. new
5667          * user-space does not rely on any kernel feature
5668          * extensions we dont know about yet.
5669          */
5670         if (size > sizeof(*attr)) {
5671                 unsigned char __user *addr;
5672                 unsigned char __user *end;
5673                 unsigned char val;
5674
5675                 addr = (void __user *)uattr + sizeof(*attr);
5676                 end  = (void __user *)uattr + size;
5677
5678                 for (; addr < end; addr++) {
5679                         ret = get_user(val, addr);
5680                         if (ret)
5681                                 return ret;
5682                         if (val)
5683                                 goto err_size;
5684                 }
5685                 size = sizeof(*attr);
5686         }
5687
5688         ret = copy_from_user(attr, uattr, size);
5689         if (ret)
5690                 return -EFAULT;
5691
5692         /*
5693          * If the type exists, the corresponding creation will verify
5694          * the attr->config.
5695          */
5696         if (attr->type >= PERF_TYPE_MAX)
5697                 return -EINVAL;
5698
5699         if (attr->__reserved_1)
5700                 return -EINVAL;
5701
5702         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
5703                 return -EINVAL;
5704
5705         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
5706                 return -EINVAL;
5707
5708 out:
5709         return ret;
5710
5711 err_size:
5712         put_user(sizeof(*attr), &uattr->size);
5713         ret = -E2BIG;
5714         goto out;
5715 }
5716
5717 static int
5718 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
5719 {
5720         struct perf_buffer *buffer = NULL, *old_buffer = NULL;
5721         int ret = -EINVAL;
5722
5723         if (!output_event)
5724                 goto set;
5725
5726         /* don't allow circular references */
5727         if (event == output_event)
5728                 goto out;
5729
5730         /*
5731          * Don't allow cross-cpu buffers
5732          */
5733         if (output_event->cpu != event->cpu)
5734                 goto out;
5735
5736         /*
5737          * If its not a per-cpu buffer, it must be the same task.
5738          */
5739         if (output_event->cpu == -1 && output_event->ctx != event->ctx)
5740                 goto out;
5741
5742 set:
5743         mutex_lock(&event->mmap_mutex);
5744         /* Can't redirect output if we've got an active mmap() */
5745         if (atomic_read(&event->mmap_count))
5746                 goto unlock;
5747
5748         if (output_event) {
5749                 /* get the buffer we want to redirect to */
5750                 buffer = perf_buffer_get(output_event);
5751                 if (!buffer)
5752                         goto unlock;
5753         }
5754
5755         old_buffer = event->buffer;
5756         rcu_assign_pointer(event->buffer, buffer);
5757         ret = 0;
5758 unlock:
5759         mutex_unlock(&event->mmap_mutex);
5760
5761         if (old_buffer)
5762                 perf_buffer_put(old_buffer);
5763 out:
5764         return ret;
5765 }
5766
5767 /**
5768  * sys_perf_event_open - open a performance event, associate it to a task/cpu
5769  *
5770  * @attr_uptr:  event_id type attributes for monitoring/sampling
5771  * @pid:                target pid
5772  * @cpu:                target cpu
5773  * @group_fd:           group leader event fd
      * @flags:              PERF_FLAG_FD_NO_GROUP and/or PERF_FLAG_FD_OUTPUT;
      *                      any other bit is rejected with -EINVAL
      *
      * Returns the new event file descriptor on success, or a negative
      * errno value on failure.
5774  */
5775 SYSCALL_DEFINE5(perf_event_open,
5776                 struct perf_event_attr __user *, attr_uptr,
5777                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
5778 {
5779         struct perf_event *group_leader = NULL, *output_event = NULL;
5780         struct perf_event *event, *sibling;
5781         struct perf_event_attr attr;
5782         struct perf_event_context *ctx;
5783         struct file *event_file = NULL;
5784         struct file *group_file = NULL;
5785         struct task_struct *task = NULL;
5786         struct pmu *pmu;
5787         int event_fd;
5788         int move_group = 0;
5789         int fput_needed = 0;
5790         int err;
5791
5792         /* for future expandability... */
5793         if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
5794                 return -EINVAL;
5795
5796         err = perf_copy_attr(attr_uptr, &attr);
5797         if (err)
5798                 return err;
5799
             /*
              * Events that do not exclude the kernel need CAP_SYS_ADMIN
              * whenever perf_paranoid_kernel() says so.
              */
5800         if (!attr.exclude_kernel) {
5801                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
5802                         return -EACCES;
5803         }
5804
             /* Frequency mode: reject rates above the sysctl limit. */
5805         if (attr.freq) {
5806                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
5807                         return -EINVAL;
5808         }
5809
             /*
              * Reserve the descriptor number up front; it is only bound to
              * the event file by fd_install() at the very end, and given
              * back with put_unused_fd() on every error path.
              */
5810         event_fd = get_unused_fd_flags(O_RDWR);
5811         if (event_fd < 0)
5812                 return event_fd;
5813
             /*
              * group_fd serves two purposes, selected by flags: it names
              * the group leader (unless PERF_FLAG_FD_NO_GROUP) and/or the
              * event whose buffer we output into (PERF_FLAG_FD_OUTPUT).
              * group_file is remembered separately because group_leader
              * may be reset to NULL while the file reference must still be
              * dropped with fput_light().
              */
5814         if (group_fd != -1) {
5815                 group_leader = perf_fget_light(group_fd, &fput_needed);
5816                 if (IS_ERR(group_leader)) {
5817                         err = PTR_ERR(group_leader);
5818                         goto err_fd;
5819                 }
5820                 group_file = group_leader->filp;
5821                 if (flags & PERF_FLAG_FD_OUTPUT)
5822                         output_event = group_leader;
5823                 if (flags & PERF_FLAG_FD_NO_GROUP)
5824                         group_leader = NULL;
5825         }
5826
             /*
              * NOTE(review): the success path never calls put_task_struct()
              * on the task looked up here, while err_task does. Presumably
              * find_get_context() takes over the reference - confirm, and
              * confirm that the err_context/err_alloc fall-through into
              * err_task cannot drop it a second time.
              */
5827         if (pid != -1) {
5828                 task = find_lively_task_by_vpid(pid);
5829                 if (IS_ERR(task)) {
5830                         err = PTR_ERR(task);
5831                         goto err_group_fd;
5832                 }
5833         }
5834
5835         event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
5836         if (IS_ERR(event)) {
5837                 err = PTR_ERR(event);
5838                 goto err_task;
5839         }
5840
5841         /*
5842          * Special case software events and allow them to be part of
5843          * any hardware group.
5844          */
5845         pmu = event->pmu;
5846
5847         if (group_leader &&
5848             (is_software_event(event) != is_software_event(group_leader))) {
5849                 if (is_software_event(event)) {
5850                         /*
5851                          * If event and group_leader are not both a software
5852                          * event, and event is, then group leader is not.
5853                          *
5854                          * Allow the addition of software events to !software
5855                          * groups, this is safe because software events never
5856                          * fail to schedule.
5857                          */
5858                         pmu = group_leader->pmu;
5859                 } else if (is_software_event(group_leader) &&
5860                            (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
5861                         /*
5862                          * In case the group is a pure software group, and we
5863                          * try to add a hardware event, move the whole group to
5864                          * the hardware context.
5865                          */
5866                         move_group = 1;
5867                 }
5868         }
5869
5870         /*
5871          * Get the target context (task or percpu):
5872          */
5873         ctx = find_get_context(pmu, task, cpu);
5874         if (IS_ERR(ctx)) {
5875                 err = PTR_ERR(ctx);
5876                 goto err_alloc;
5877         }
5878
5879         /*
5880          * Look up the group leader (we will attach this event to it):
5881          */
5882         if (group_leader) {
5883                 err = -EINVAL;
5884
5885                 /*
5886                  * Do not allow a recursive hierarchy (this new sibling
5887                  * becoming part of another group-sibling):
5888                  */
5889                 if (group_leader->group_leader != group_leader)
5890                         goto err_context;
5891                 /*
5892                  * Do not allow to attach to a group in a different
5893                  * task or CPU context:
5894                  */
                     /*
                      * When the group is about to be moved its current
                      * context necessarily differs from ctx, so only the
                      * context type is compared in that case.
                      */
5895                 if (move_group) {
5896                         if (group_leader->ctx->type != ctx->type)
5897                                 goto err_context;
5898                 } else {
5899                         if (group_leader->ctx != ctx)
5900                                 goto err_context;
5901                 }
5902
5903                 /*
5904                  * Only a group leader can be exclusive or pinned
5905                  */
5906                 if (attr.exclusive || attr.pinned)
5907                         goto err_context;
5908         }
5909
5910         if (output_event) {
5911                 err = perf_event_set_output(event, output_event);
5912                 if (err)
5913                         goto err_context;
5914         }
5915
             /* Last step that can fail; nothing below jumps to an error label. */
5916         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
5917         if (IS_ERR(event_file)) {
5918                 err = PTR_ERR(event_file);
5919                 goto err_context;
5920         }
5921
             /*
              * Moving a group: first take the leader and all of its siblings
              * out of their current context. put_ctx() is called once per
              * removed event - presumably the reference each installed event
              * holds on its context; the matching get_ctx() calls are in the
              * re-install loop below.
              */
5922         if (move_group) {
5923                 struct perf_event_context *gctx = group_leader->ctx;
5924
5925                 mutex_lock(&gctx->mutex);
5926                 perf_event_remove_from_context(group_leader);
5927                 list_for_each_entry(sibling, &group_leader->sibling_list,
5928                                     group_entry) {
5929                         perf_event_remove_from_context(sibling);
5930                         put_ctx(gctx);
5931                 }
5932                 mutex_unlock(&gctx->mutex);
5933                 put_ctx(gctx);
5934         }
5935
5936         event->filp = event_file;
5937         WARN_ON_ONCE(ctx->parent_ctx);
5938         mutex_lock(&ctx->mutex);
5939
             /* Re-install the moved group in the target context, leader first. */
5940         if (move_group) {
5941                 perf_install_in_context(ctx, group_leader, cpu);
5942                 get_ctx(ctx);
5943                 list_for_each_entry(sibling, &group_leader->sibling_list,
5944                                     group_entry) {
5945                         perf_install_in_context(ctx, sibling, cpu);
5946                         get_ctx(ctx);
5947                 }
5948         }
5949
5950         perf_install_in_context(ctx, event, cpu);
5951         ++ctx->generation;
5952         mutex_unlock(&ctx->mutex);
5953
             /* Record the creating task as owner and link the event on its list. */
5954         event->owner = current;
5955
5956         mutex_lock(&current->perf_event_mutex);
5957         list_add_tail(&event->owner_entry, &current->perf_event_list);
5958         mutex_unlock(&current->perf_event_mutex);
5959
5960         /*
5961          * Precalculate sample_data sizes
5962          */
5963         perf_event__header_size(event);
5964         perf_event__id_header_size(event);
5965
5966         /*
5967          * Drop the reference on the group_event after placing the
5968          * new event on the sibling_list. This ensures destruction
5969          * of the group leader will find the pointer to itself in
5970          * perf_group_detach().
5971          */
5972         fput_light(group_file, fput_needed);
5973         fd_install(event_fd, event_file);
5974         return event_fd;
5975
             /* Error unwinding: each label undoes one step and falls through. */
5976 err_context:
5977         put_ctx(ctx);
5978 err_alloc:
5979         free_event(event);
5980 err_task:
5981         if (task)
5982                 put_task_struct(task);
5983 err_group_fd:
5984         fput_light(group_file, fput_needed);
5985 err_fd:
5986         put_unused_fd(event_fd);
5987         return err;
5988 }
5989
5990 /**
5991  * perf_event_create_kernel_counter
5992  *
5993  * @attr: attributes of the counter to create
5994  * @cpu: cpu in which the counter is bound
5995  * @task: task to profile (NULL for percpu)
5996  */
5997 struct perf_event *
5998 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5999                                  struct task_struct *task,
6000                                  perf_overflow_handler_t overflow_handler)
6001 {
6002         struct perf_event_context *ctx;
6003         struct perf_event *event;
6004         int err;
6005
6006         /*
6007          * Get the target context (task or percpu):
6008          */
6009
6010         event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
6011         if (IS_ERR(event)) {
6012                 err = PTR_ERR(event);
6013                 goto err;
6014         }
6015
6016         ctx = find_get_context(event->pmu, task, cpu);
6017         if (IS_ERR(ctx)) {
6018                 err = PTR_ERR(ctx);
6019                 goto err_free;
6020         }
6021
6022         event->filp = NULL;
6023         WARN_ON_ONCE(ctx->parent_ctx);
6024         mutex_lock(&ctx->mutex);
6025         perf_install_in_context(ctx, event, cpu);
6026         ++ctx->generation;
6027         mutex_unlock(&ctx->mutex);
6028
6029         return event;
6030
6031 err_free:
6032         free_event(event);
6033 err:
6034         return ERR_PTR(err);
6035 }
6036 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
6037
6038 static void sync_child_event(struct perf_event *child_event,
6039                                struct task_struct *child)
6040 {
6041         struct perf_event *parent_event = child_event->parent;
6042         u64 child_val;
6043
6044         if (child_event->attr.inherit_stat)
6045                 perf_event_read_event(child_event, child);
6046
6047         child_val = perf_event_count(child_event);
6048
6049         /*
6050          * Add back the child's count to the parent's count:
6051          */
6052         atomic64_add(child_val, &parent_event->child_count);
6053         atomic64_add(child_event->total_time_enabled,
6054                      &parent_event->child_total_time_enabled);
6055         atomic64_add(child_event->total_time_running,
6056                      &parent_event->child_total_time_running);
6057
6058         /*
6059          * Remove this event from the parent's list
6060          */
6061         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6062         mutex_lock(&parent_event->child_mutex);
6063         list_del_init(&child_event->child_list);
6064         mutex_unlock(&parent_event->child_mutex);
6065
6066         /*
6067          * Release the parent event, if this was the last
6068          * reference to it.
6069          */
6070         fput(parent_event->filp);
6071 }
6072
6073 static void
6074 __perf_event_exit_task(struct perf_event *child_event,
6075                          struct perf_event_context *child_ctx,
6076                          struct task_struct *child)
6077 {
6078         struct perf_event *parent_event;
6079
6080         perf_event_remove_from_context(child_event);
6081
6082         parent_event = child_event->parent;
6083         /*
6084          * It can happen that parent exits first, and has events
6085          * that are still around due to the child reference. These
6086          * events need to be zapped - but otherwise linger.
6087          */
6088         if (parent_event) {
6089                 sync_child_event(child_event, child);
6090                 free_event(child_event);
6091         }
6092 }
6093
6094 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6095 {
6096         struct perf_event *child_event, *tmp;
6097         struct perf_event_context *child_ctx;
6098         unsigned long flags;
6099
6100         if (likely(!child->perf_event_ctxp[ctxn])) {
6101                 perf_event_task(child, NULL, 0);
6102                 return;
6103         }
6104
6105         local_irq_save(flags);
6106         /*
6107          * We can't reschedule here because interrupts are disabled,
6108          * and either child is current or it is a task that can't be
6109          * scheduled, so we are now safe from rescheduling changing
6110          * our context.
6111          */
6112         child_ctx = child->perf_event_ctxp[ctxn];
6113         task_ctx_sched_out(child_ctx, EVENT_ALL);
6114
6115         /*
6116          * Take the context lock here so that if find_get_context is
6117          * reading child->perf_event_ctxp, we wait until it has
6118          * incremented the context's refcount before we do put_ctx below.
6119          */
6120         raw_spin_lock(&child_ctx->lock);
6121         child->perf_event_ctxp[ctxn] = NULL;
6122         /*
6123          * If this context is a clone; unclone it so it can't get
6124          * swapped to another process while we're removing all
6125          * the events from it.
6126          */
6127         unclone_ctx(child_ctx);
6128         update_context_time(child_ctx);
6129         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
6130
6131         /*
6132          * Report the task dead after unscheduling the events so that we
6133          * won't get any samples after PERF_RECORD_EXIT. We can however still
6134          * get a few PERF_RECORD_READ events.
6135          */
6136         perf_event_task(child, child_ctx, 0);
6137
6138         /*
6139          * We can recurse on the same lock type through:
6140          *
6141          *   __perf_event_exit_task()
6142          *     sync_child_event()
6143          *       fput(parent_event->filp)
6144          *         perf_release()
6145          *           mutex_lock(&ctx->mutex)
6146          *
6147          * But since its the parent context it won't be the same instance.
6148          */
6149         mutex_lock(&child_ctx->mutex);
6150
6151 again:
6152         list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
6153                                  group_entry)
6154                 __perf_event_exit_task(child_event, child_ctx, child);
6155
6156         list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
6157                                  group_entry)
6158                 __perf_event_exit_task(child_event, child_ctx, child);
6159
6160         /*
6161          * If the last event was a group event, it will have appended all
6162          * its siblings to the list, but we obtained 'tmp' before that which
6163          * will still point to the list head terminating the iteration.
6164          */
6165         if (!list_empty(&child_ctx->pinned_groups) ||
6166             !list_empty(&child_ctx->flexible_groups))
6167                 goto again;
6168
6169         mutex_unlock(&child_ctx->mutex);
6170
6171         put_ctx(child_ctx);
6172 }
6173
6174 /*
6175  * When a child task exits, feed back event values to parent events.
6176  */
6177 void perf_event_exit_task(struct task_struct *child)
6178 {
6179         struct perf_event *event, *tmp;
6180         int ctxn;
6181
6182         mutex_lock(&child->perf_event_mutex);
6183         list_for_each_entry_safe(event, tmp, &child->perf_event_list,
6184                                  owner_entry) {
6185                 list_del_init(&event->owner_entry);
6186
6187                 /*
6188                  * Ensure the list deletion is visible before we clear
6189                  * the owner, closes a race against perf_release() where
6190                  * we need to serialize on the owner->perf_event_mutex.
6191                  */
6192                 smp_wmb();
6193                 event->owner = NULL;
6194         }
6195         mutex_unlock(&child->perf_event_mutex);
6196
6197         for_each_task_context_nr(ctxn)
6198                 perf_event_exit_task_context(child, ctxn);
6199 }
6200
6201 static void perf_free_event(struct perf_event *event,
6202                             struct perf_event_context *ctx)
6203 {
6204         struct perf_event *parent = event->parent;
6205
6206         if (WARN_ON_ONCE(!parent))
6207                 return;
6208
6209         mutex_lock(&parent->child_mutex);
6210         list_del_init(&event->child_list);
6211         mutex_unlock(&parent->child_mutex);
6212
6213         fput(parent->filp);
6214
6215         perf_group_detach(event);
6216         list_del_event(event, ctx);
6217         free_event(event);
6218 }
6219
6220 /*
6221  * free an unexposed, unused context as created by inheritance by
6222  * perf_event_init_task below, used by fork() in case of fail.
6223  */
6224 void perf_event_free_task(struct task_struct *task)
6225 {
6226         struct perf_event_context *ctx;
6227         struct perf_event *event, *tmp;
6228         int ctxn;
6229
6230         for_each_task_context_nr(ctxn) {
6231                 ctx = task->perf_event_ctxp[ctxn];
6232                 if (!ctx)
6233                         continue;
6234
6235                 mutex_lock(&ctx->mutex);
6236 again:
6237                 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
6238                                 group_entry)
6239                         perf_free_event(event, ctx);
6240
6241                 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
6242                                 group_entry)
6243                         perf_free_event(event, ctx);
6244
6245                 if (!list_empty(&ctx->pinned_groups) ||
6246                                 !list_empty(&ctx->flexible_groups))
6247                         goto again;
6248
6249                 mutex_unlock(&ctx->mutex);
6250
6251                 put_ctx(ctx);
6252         }
6253 }
6254
6255 void perf_event_delayed_put(struct task_struct *task)
6256 {
6257         int ctxn;
6258
6259         for_each_task_context_nr(ctxn)
6260                 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
6261 }
6262
6263 /*
6264  * inherit a event from parent task to child task:
6265  */
6266 static struct perf_event *
6267 inherit_event(struct perf_event *parent_event,
6268               struct task_struct *parent,
6269               struct perf_event_context *parent_ctx,
6270               struct task_struct *child,
6271               struct perf_event *group_leader,
6272               struct perf_event_context *child_ctx)
6273 {
6274         struct perf_event *child_event;
6275         unsigned long flags;
6276
6277         /*
6278          * Instead of creating recursive hierarchies of events,
6279          * we link inherited events back to the original parent,
6280          * which has a filp for sure, which we use as the reference
6281          * count:
6282          */
6283         if (parent_event->parent)
6284                 parent_event = parent_event->parent;
6285
6286         child_event = perf_event_alloc(&parent_event->attr,
6287                                            parent_event->cpu,
6288                                            child,
6289                                            group_leader, parent_event,
6290                                            NULL);
6291         if (IS_ERR(child_event))
6292                 return child_event;
6293         get_ctx(child_ctx);
6294
6295         /*
6296          * Make the child state follow the state of the parent event,
6297          * not its attr.disabled bit.  We hold the parent's mutex,
6298          * so we won't race with perf_event_{en, dis}able_family.
6299          */
6300         if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6301                 child_event->state = PERF_EVENT_STATE_INACTIVE;
6302         else
6303                 child_event->state = PERF_EVENT_STATE_OFF;
6304
6305         if (parent_event->attr.freq) {
6306                 u64 sample_period = parent_event->hw.sample_period;
6307                 struct hw_perf_event *hwc = &child_event->hw;
6308
6309                 hwc->sample_period = sample_period;
6310                 hwc->last_period   = sample_period;
6311
6312                 local64_set(&hwc->period_left, sample_period);
6313         }
6314
6315         child_event->ctx = child_ctx;
6316         child_event->overflow_handler = parent_event->overflow_handler;
6317
6318         /*
6319          * Precalculate sample_data sizes
6320          */
6321         perf_event__header_size(child_event);
6322         perf_event__id_header_size(child_event);
6323
6324         /*
6325          * Link it up in the child's context:
6326          */
6327         raw_spin_lock_irqsave(&child_ctx->lock, flags);
6328         add_event_to_ctx(child_event, child_ctx);
6329         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
6330
6331         /*
6332          * Get a reference to the parent filp - we will fput it
6333          * when the child event exits. This is safe to do because
6334          * we are in the parent and we know that the filp still
6335          * exists and has a nonzero count:
6336          */
6337         atomic_long_inc(&parent_event->filp->f_count);
6338
6339         /*
6340          * Link this into the parent event's child list
6341          */
6342         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6343         mutex_lock(&parent_event->child_mutex);
6344         list_add_tail(&child_event->child_list, &parent_event->child_list);
6345         mutex_unlock(&parent_event->child_mutex);
6346
6347         return child_event;
6348 }
6349
6350 static int inherit_group(struct perf_event *parent_event,
6351               struct task_struct *parent,
6352               struct perf_event_context *parent_ctx,
6353               struct task_struct *child,
6354               struct perf_event_context *child_ctx)
6355 {
6356         struct perf_event *leader;
6357         struct perf_event *sub;
6358         struct perf_event *child_ctr;
6359
6360         leader = inherit_event(parent_event, parent, parent_ctx,
6361                                  child, NULL, child_ctx);
6362         if (IS_ERR(leader))
6363                 return PTR_ERR(leader);
6364         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
6365                 child_ctr = inherit_event(sub, parent, parent_ctx,
6366                                             child, leader, child_ctx);
6367                 if (IS_ERR(child_ctr))
6368                         return PTR_ERR(child_ctr);
6369         }
6370         return 0;
6371 }
6372
6373 static int
6374 inherit_task_group(struct perf_event *event, struct task_struct *parent,
6375                    struct perf_event_context *parent_ctx,
6376                    struct task_struct *child, int ctxn,
6377                    int *inherited_all)
6378 {
6379         int ret;
6380         struct perf_event_context *child_ctx;
6381
6382         if (!event->attr.inherit) {
6383                 *inherited_all = 0;
6384                 return 0;
6385         }
6386
6387         child_ctx = child->perf_event_ctxp[ctxn];
6388         if (!child_ctx) {
6389                 /*
6390                  * This is executed from the parent task context, so
6391                  * inherit events that have been marked for cloning.
6392                  * First allocate and initialize a context for the
6393                  * child.
6394                  */
6395
6396                 child_ctx = alloc_perf_context(event->pmu, child);
6397                 if (!child_ctx)
6398                         return -ENOMEM;
6399
6400                 child->perf_event_ctxp[ctxn] = child_ctx;
6401         }
6402
6403         ret = inherit_group(event, parent, parent_ctx,
6404                             child, child_ctx);
6405
6406         if (ret)
6407                 *inherited_all = 0;
6408
6409         return ret;
6410 }
6411
6412 /*
6413  * Initialize the perf_event context in task_struct
6414  */
6415 int perf_event_init_context(struct task_struct *child, int ctxn)
6416 {
6417         struct perf_event_context *child_ctx, *parent_ctx;
6418         struct perf_event_context *cloned_ctx;
6419         struct perf_event *event;
6420         struct task_struct *parent = current;
6421         int inherited_all = 1;
6422         unsigned long flags;
6423         int ret = 0;
6424
6425         child->perf_event_ctxp[ctxn] = NULL;
6426
6427         mutex_init(&child->perf_event_mutex);
6428         INIT_LIST_HEAD(&child->perf_event_list);
6429
6430         if (likely(!parent->perf_event_ctxp[ctxn]))
6431                 return 0;
6432
6433         /*
6434          * If the parent's context is a clone, pin it so it won't get
6435          * swapped under us.
6436          */
6437         parent_ctx = perf_pin_task_context(parent, ctxn);
6438
6439         /*
6440          * No need to check if parent_ctx != NULL here; since we saw
6441          * it non-NULL earlier, the only reason for it to become NULL
6442          * is if we exit, and since we're currently in the middle of
6443          * a fork we can't be exiting at the same time.
6444          */
6445
6446         /*
6447          * Lock the parent list. No need to lock the child - not PID
6448          * hashed yet and not running, so nobody can access it.
6449          */
6450         mutex_lock(&parent_ctx->mutex);
6451
6452         /*
6453          * We dont have to disable NMIs - we are only looking at
6454          * the list, not manipulating it:
6455          */
6456         list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
6457                 ret = inherit_task_group(event, parent, parent_ctx,
6458                                          child, ctxn, &inherited_all);
6459                 if (ret)
6460                         break;
6461         }
6462
6463         /*
6464          * We can't hold ctx->lock when iterating the ->flexible_group list due
6465          * to allocations, but we need to prevent rotation because
6466          * rotate_ctx() will change the list from interrupt context.
6467          */
6468         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6469         parent_ctx->rotate_disable = 1;
6470         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6471
6472         list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
6473                 ret = inherit_task_group(event, parent, parent_ctx,
6474                                          child, ctxn, &inherited_all);
6475                 if (ret)
6476                         break;
6477         }
6478
6479         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6480         parent_ctx->rotate_disable = 0;
6481         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6482
6483         child_ctx = child->perf_event_ctxp[ctxn];
6484
6485         if (child_ctx && inherited_all) {
6486                 /*
6487                  * Mark the child context as a clone of the parent
6488                  * context, or of whatever the parent is a clone of.
6489                  * Note that if the parent is a clone, it could get
6490                  * uncloned at any point, but that doesn't matter
6491                  * because the list of events and the generation
6492                  * count can't have changed since we took the mutex.
6493                  */
6494                 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
6495                 if (cloned_ctx) {
6496                         child_ctx->parent_ctx = cloned_ctx;
6497                         child_ctx->parent_gen = parent_ctx->parent_gen;
6498                 } else {
6499                         child_ctx->parent_ctx = parent_ctx;
6500                         child_ctx->parent_gen = parent_ctx->generation;
6501                 }
6502                 get_ctx(child_ctx->parent_ctx);
6503         }
6504
6505         mutex_unlock(&parent_ctx->mutex);
6506
6507         perf_unpin_context(parent_ctx);
6508
6509         return ret;
6510 }
6511
6512 /*
6513  * Initialize the perf_event context in task_struct
6514  */
6515 int perf_event_init_task(struct task_struct *child)
6516 {
6517         int ctxn, ret;
6518
6519         for_each_task_context_nr(ctxn) {
6520                 ret = perf_event_init_context(child, ctxn);
6521                 if (ret)
6522                         return ret;
6523         }
6524
6525         return 0;
6526 }
6527
6528 static void __init perf_event_init_all_cpus(void)
6529 {
6530         struct swevent_htable *swhash;
6531         int cpu;
6532
6533         for_each_possible_cpu(cpu) {
6534                 swhash = &per_cpu(swevent_htable, cpu);
6535                 mutex_init(&swhash->hlist_mutex);
6536                 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
6537         }
6538 }
6539
6540 static void __cpuinit perf_event_init_cpu(int cpu)
6541 {
6542         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6543
6544         mutex_lock(&swhash->hlist_mutex);
6545         if (swhash->hlist_refcount > 0) {
6546                 struct swevent_hlist *hlist;
6547
6548                 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
6549                 WARN_ON(!hlist);
6550                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
6551         }
6552         mutex_unlock(&swhash->hlist_mutex);
6553 }
6554
6555 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
6556 static void perf_pmu_rotate_stop(struct pmu *pmu)
6557 {
6558         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6559
6560         WARN_ON(!irqs_disabled());
6561
6562         list_del_init(&cpuctx->rotation_list);
6563 }
6564
6565 static void __perf_event_exit_context(void *__info)
6566 {
6567         struct perf_event_context *ctx = __info;
6568         struct perf_event *event, *tmp;
6569
6570         perf_pmu_rotate_stop(ctx->pmu);
6571
6572         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
6573                 __perf_event_remove_from_context(event);
6574         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
6575                 __perf_event_remove_from_context(event);
6576 }
6577
6578 static void perf_event_exit_cpu_context(int cpu)
6579 {
6580         struct perf_event_context *ctx;
6581         struct pmu *pmu;
6582         int idx;
6583
6584         idx = srcu_read_lock(&pmus_srcu);
6585         list_for_each_entry_rcu(pmu, &pmus, entry) {
6586                 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
6587
6588                 mutex_lock(&ctx->mutex);
6589                 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
6590                 mutex_unlock(&ctx->mutex);
6591         }
6592         srcu_read_unlock(&pmus_srcu, idx);
6593 }
6594
6595 static void perf_event_exit_cpu(int cpu)
6596 {
6597         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6598
6599         mutex_lock(&swhash->hlist_mutex);
6600         swevent_hlist_release(swhash);
6601         mutex_unlock(&swhash->hlist_mutex);
6602
6603         perf_event_exit_cpu_context(cpu);
6604 }
6605 #else
/* Neither CONFIG_HOTPLUG_CPU nor CONFIG_KEXEC is set: nothing to do. */
static inline void perf_event_exit_cpu(int cpu)
{
}
6607 #endif
6608
6609 static int
6610 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
6611 {
6612         int cpu;
6613
6614         for_each_online_cpu(cpu)
6615                 perf_event_exit_cpu(cpu);
6616
6617         return NOTIFY_OK;
6618 }
6619
6620 /*
6621  * Run the perf reboot notifier at the very last possible moment so that
6622  * the generic watchdog code runs as long as possible.
6623  */
6624 static struct notifier_block perf_reboot_notifier = {
6625         .notifier_call = perf_reboot,
6626         .priority = INT_MIN,
6627 };
6628
6629 static int __cpuinit
6630 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6631 {
6632         unsigned int cpu = (long)hcpu;
6633
6634         switch (action & ~CPU_TASKS_FROZEN) {
6635
6636         case CPU_UP_PREPARE:
6637         case CPU_DOWN_FAILED:
6638                 perf_event_init_cpu(cpu);
6639                 break;
6640
6641         case CPU_UP_CANCELED:
6642         case CPU_DOWN_PREPARE:
6643                 perf_event_exit_cpu(cpu);
6644                 break;
6645
6646         default:
6647                 break;
6648         }
6649
6650         return NOTIFY_OK;
6651 }
6652
6653 void __init perf_event_init(void)
6654 {
6655         int ret;
6656
6657         idr_init(&pmu_idr);
6658
6659         perf_event_init_all_cpus();
6660         init_srcu_struct(&pmus_srcu);
6661         perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
6662         perf_pmu_register(&perf_cpu_clock, NULL, -1);
6663         perf_pmu_register(&perf_task_clock, NULL, -1);
6664         perf_tp_register();
6665         perf_cpu_notifier(perf_cpu_notify);
6666         register_reboot_notifier(&perf_reboot_notifier);
6667
6668         ret = init_hw_breakpoint();
6669         WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6670 }
6671
6672 static int __init perf_event_sysfs_init(void)
6673 {
6674         struct pmu *pmu;
6675         int ret;
6676
6677         mutex_lock(&pmus_lock);
6678
6679         ret = bus_register(&pmu_bus);
6680         if (ret)
6681                 goto unlock;
6682
6683         list_for_each_entry(pmu, &pmus, entry) {
6684                 if (!pmu->name || pmu->type < 0)
6685                         continue;
6686
6687                 ret = pmu_dev_alloc(pmu);
6688                 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
6689         }
6690         pmu_bus_running = 1;
6691         ret = 0;
6692
6693 unlock:
6694         mutex_unlock(&pmus_lock);
6695
6696         return ret;
6697 }
6698 device_initcall(perf_event_sysfs_init);