Merge branch 'tip/perf/core' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt...
[linux-2.6.git] / kernel / trace / trace_syscalls.c
1 #include <trace/syscall.h>
2 #include <trace/events/syscalls.h>
3 #include <linux/slab.h>
4 #include <linux/kernel.h>
5 #include <linux/ftrace.h>
6 #include <linux/perf_event.h>
7 #include <asm/syscall.h>
8
9 #include "trace_output.h"
10 #include "trace.h"
11
/* Taken around every probe (un)registration and refcount/bitmap update below. */
static DEFINE_MUTEX(syscall_trace_lock);

/* Count of currently registered ftrace syscall events, one per direction. */
static int sys_refcount_enter;
static int sys_refcount_exit;

/* One bit per syscall number, set while that syscall's ftrace event is on. */
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);

/* Event class callbacks, defined further down in this file. */
static int syscall_enter_define_fields(struct ftrace_event_call *call);
static int syscall_exit_define_fields(struct ftrace_event_call *call);

static int syscall_enter_register(struct ftrace_event_call *event,
				  enum trace_reg type);
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type);
25
26 static struct list_head *
27 syscall_get_enter_fields(struct ftrace_event_call *call)
28 {
29         struct syscall_metadata *entry = call->data;
30
31         return &entry->enter_fields;
32 }
33
34 struct trace_event_functions enter_syscall_print_funcs = {
35         .trace          = print_syscall_enter,
36 };
37
38 struct trace_event_functions exit_syscall_print_funcs = {
39         .trace          = print_syscall_exit,
40 };
41
42 struct ftrace_event_class event_class_syscall_enter = {
43         .system         = "syscalls",
44         .reg            = syscall_enter_register,
45         .define_fields  = syscall_enter_define_fields,
46         .get_fields     = syscall_get_enter_fields,
47         .raw_init       = init_syscall_trace,
48 };
49
50 struct ftrace_event_class event_class_syscall_exit = {
51         .system         = "syscalls",
52         .reg            = syscall_exit_register,
53         .define_fields  = syscall_exit_define_fields,
54         .fields         = LIST_HEAD_INIT(event_class_syscall_exit.fields),
55         .raw_init       = init_syscall_trace,
56 };
57
/* Bounds of the array of syscall_metadata pointers walked by find_syscall_meta(). */
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];

/* Syscall number -> metadata table; allocated and filled by init_ftrace_syscalls(). */
static struct syscall_metadata **syscalls_metadata;
62
#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
/*
 * Default symbol/metadata name comparison.
 *
 * The first three characters of both strings are skipped: archs using
 * syscall wrappers may expose aliases spelled "SyS..." rather than
 * "sys...", and comparing the prefix would produce a false mismatch.
 * Both strings must therefore be at least three characters long.
 *
 * Returns true when the remainders are identical.
 */
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	const char *sym_tail = sym + 3;
	const char *name_tail = name + 3;

	return strcmp(sym_tail, name_tail) == 0;
}
#endif
75
76 static __init struct syscall_metadata *
77 find_syscall_meta(unsigned long syscall)
78 {
79         struct syscall_metadata **start;
80         struct syscall_metadata **stop;
81         char str[KSYM_SYMBOL_LEN];
82
83
84         start = __start_syscalls_metadata;
85         stop = __stop_syscalls_metadata;
86         kallsyms_lookup(syscall, NULL, NULL, NULL, str);
87
88         if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
89                 return NULL;
90
91         for ( ; start < stop; start++) {
92                 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
93                         return *start;
94         }
95         return NULL;
96 }
97
98 static struct syscall_metadata *syscall_nr_to_meta(int nr)
99 {
100         if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
101                 return NULL;
102
103         return syscalls_metadata[nr];
104 }
105
106 enum print_line_t
107 print_syscall_enter(struct trace_iterator *iter, int flags,
108                     struct trace_event *event)
109 {
110         struct trace_seq *s = &iter->seq;
111         struct trace_entry *ent = iter->ent;
112         struct syscall_trace_enter *trace;
113         struct syscall_metadata *entry;
114         int i, ret, syscall;
115
116         trace = (typeof(trace))ent;
117         syscall = trace->nr;
118         entry = syscall_nr_to_meta(syscall);
119
120         if (!entry)
121                 goto end;
122
123         if (entry->enter_event->event.type != ent->type) {
124                 WARN_ON_ONCE(1);
125                 goto end;
126         }
127
128         ret = trace_seq_printf(s, "%s(", entry->name);
129         if (!ret)
130                 return TRACE_TYPE_PARTIAL_LINE;
131
132         for (i = 0; i < entry->nb_args; i++) {
133                 /* parameter types */
134                 if (trace_flags & TRACE_ITER_VERBOSE) {
135                         ret = trace_seq_printf(s, "%s ", entry->types[i]);
136                         if (!ret)
137                                 return TRACE_TYPE_PARTIAL_LINE;
138                 }
139                 /* parameter values */
140                 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
141                                        trace->args[i],
142                                        i == entry->nb_args - 1 ? "" : ", ");
143                 if (!ret)
144                         return TRACE_TYPE_PARTIAL_LINE;
145         }
146
147         ret = trace_seq_putc(s, ')');
148         if (!ret)
149                 return TRACE_TYPE_PARTIAL_LINE;
150
151 end:
152         ret =  trace_seq_putc(s, '\n');
153         if (!ret)
154                 return TRACE_TYPE_PARTIAL_LINE;
155
156         return TRACE_TYPE_HANDLED;
157 }
158
159 enum print_line_t
160 print_syscall_exit(struct trace_iterator *iter, int flags,
161                    struct trace_event *event)
162 {
163         struct trace_seq *s = &iter->seq;
164         struct trace_entry *ent = iter->ent;
165         struct syscall_trace_exit *trace;
166         int syscall;
167         struct syscall_metadata *entry;
168         int ret;
169
170         trace = (typeof(trace))ent;
171         syscall = trace->nr;
172         entry = syscall_nr_to_meta(syscall);
173
174         if (!entry) {
175                 trace_seq_printf(s, "\n");
176                 return TRACE_TYPE_HANDLED;
177         }
178
179         if (entry->exit_event->event.type != ent->type) {
180                 WARN_ON_ONCE(1);
181                 return TRACE_TYPE_UNHANDLED;
182         }
183
184         ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
185                                 trace->ret);
186         if (!ret)
187                 return TRACE_TYPE_PARTIAL_LINE;
188
189         return TRACE_TYPE_HANDLED;
190 }
191
/*
 * Only referenced when a SYSCALL_FIELD() size check fails.
 * NOTE(review): presumably left undefined on purpose so that a mismatch
 * breaks the build at link time - confirm.
 */
extern char *__bad_type_size(void);

/*
 * Expands to the (type string, name string, offset, size, signedness)
 * argument run for trace_define_field(), for member @name of a local
 * variable that must be called "trace". The type string is only used
 * when sizeof(@type) equals the size of that member.
 */
#define SYSCALL_FIELD(type, name)					\
	(sizeof(type) == sizeof(trace.name)) ?				\
		#type :							\
		__bad_type_size(),					\
	#name, offsetof(typeof(trace), name),				\
	sizeof(trace.name), is_signed_type(type)
199
200 static
201 int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
202 {
203         int i;
204         int pos = 0;
205
206         /* When len=0, we just calculate the needed length */
207 #define LEN_OR_ZERO (len ? len - pos : 0)
208
209         pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
210         for (i = 0; i < entry->nb_args; i++) {
211                 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
212                                 entry->args[i], sizeof(unsigned long),
213                                 i == entry->nb_args - 1 ? "" : ", ");
214         }
215         pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
216
217         for (i = 0; i < entry->nb_args; i++) {
218                 pos += snprintf(buf + pos, LEN_OR_ZERO,
219                                 ", ((unsigned long)(REC->%s))", entry->args[i]);
220         }
221
222 #undef LEN_OR_ZERO
223
224         /* return the length of print_fmt */
225         return pos;
226 }
227
228 static int set_syscall_print_fmt(struct ftrace_event_call *call)
229 {
230         char *print_fmt;
231         int len;
232         struct syscall_metadata *entry = call->data;
233
234         if (entry->enter_event != call) {
235                 call->print_fmt = "\"0x%lx\", REC->ret";
236                 return 0;
237         }
238
239         /* First: called with 0 length to calculate the needed length */
240         len = __set_enter_print_fmt(entry, NULL, 0);
241
242         print_fmt = kmalloc(len + 1, GFP_KERNEL);
243         if (!print_fmt)
244                 return -ENOMEM;
245
246         /* Second: actually write the @print_fmt */
247         __set_enter_print_fmt(entry, print_fmt, len + 1);
248         call->print_fmt = print_fmt;
249
250         return 0;
251 }
252
253 static void free_syscall_print_fmt(struct ftrace_event_call *call)
254 {
255         struct syscall_metadata *entry = call->data;
256
257         if (entry->enter_event == call)
258                 kfree(call->print_fmt);
259 }
260
261 static int syscall_enter_define_fields(struct ftrace_event_call *call)
262 {
263         struct syscall_trace_enter trace;
264         struct syscall_metadata *meta = call->data;
265         int ret;
266         int i;
267         int offset = offsetof(typeof(trace), args);
268
269         ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
270         if (ret)
271                 return ret;
272
273         for (i = 0; i < meta->nb_args; i++) {
274                 ret = trace_define_field(call, meta->types[i],
275                                          meta->args[i], offset,
276                                          sizeof(unsigned long), 0,
277                                          FILTER_OTHER);
278                 offset += sizeof(unsigned long);
279         }
280
281         return ret;
282 }
283
284 static int syscall_exit_define_fields(struct ftrace_event_call *call)
285 {
286         struct syscall_trace_exit trace;
287         int ret;
288
289         ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
290         if (ret)
291                 return ret;
292
293         ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
294                                  FILTER_OTHER);
295
296         return ret;
297 }
298
299 void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
300 {
301         struct syscall_trace_enter *entry;
302         struct syscall_metadata *sys_data;
303         struct ring_buffer_event *event;
304         struct ring_buffer *buffer;
305         int size;
306         int syscall_nr;
307
308         syscall_nr = syscall_get_nr(current, regs);
309         if (syscall_nr < 0)
310                 return;
311         if (!test_bit(syscall_nr, enabled_enter_syscalls))
312                 return;
313
314         sys_data = syscall_nr_to_meta(syscall_nr);
315         if (!sys_data)
316                 return;
317
318         size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
319
320         event = trace_current_buffer_lock_reserve(&buffer,
321                         sys_data->enter_event->event.type, size, 0, 0);
322         if (!event)
323                 return;
324
325         entry = ring_buffer_event_data(event);
326         entry->nr = syscall_nr;
327         syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
328
329         if (!filter_current_check_discard(buffer, sys_data->enter_event,
330                                           entry, event))
331                 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
332 }
333
334 void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
335 {
336         struct syscall_trace_exit *entry;
337         struct syscall_metadata *sys_data;
338         struct ring_buffer_event *event;
339         struct ring_buffer *buffer;
340         int syscall_nr;
341
342         syscall_nr = syscall_get_nr(current, regs);
343         if (syscall_nr < 0)
344                 return;
345         if (!test_bit(syscall_nr, enabled_exit_syscalls))
346                 return;
347
348         sys_data = syscall_nr_to_meta(syscall_nr);
349         if (!sys_data)
350                 return;
351
352         event = trace_current_buffer_lock_reserve(&buffer,
353                         sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
354         if (!event)
355                 return;
356
357         entry = ring_buffer_event_data(event);
358         entry->nr = syscall_nr;
359         entry->ret = syscall_get_return_value(current, regs);
360
361         if (!filter_current_check_discard(buffer, sys_data->exit_event,
362                                           entry, event))
363                 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
364 }
365
366 int reg_event_syscall_enter(struct ftrace_event_call *call)
367 {
368         int ret = 0;
369         int num;
370
371         num = ((struct syscall_metadata *)call->data)->syscall_nr;
372         if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
373                 return -ENOSYS;
374         mutex_lock(&syscall_trace_lock);
375         if (!sys_refcount_enter)
376                 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
377         if (!ret) {
378                 set_bit(num, enabled_enter_syscalls);
379                 sys_refcount_enter++;
380         }
381         mutex_unlock(&syscall_trace_lock);
382         return ret;
383 }
384
385 void unreg_event_syscall_enter(struct ftrace_event_call *call)
386 {
387         int num;
388
389         num = ((struct syscall_metadata *)call->data)->syscall_nr;
390         if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
391                 return;
392         mutex_lock(&syscall_trace_lock);
393         sys_refcount_enter--;
394         clear_bit(num, enabled_enter_syscalls);
395         if (!sys_refcount_enter)
396                 unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
397         mutex_unlock(&syscall_trace_lock);
398 }
399
400 int reg_event_syscall_exit(struct ftrace_event_call *call)
401 {
402         int ret = 0;
403         int num;
404
405         num = ((struct syscall_metadata *)call->data)->syscall_nr;
406         if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
407                 return -ENOSYS;
408         mutex_lock(&syscall_trace_lock);
409         if (!sys_refcount_exit)
410                 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
411         if (!ret) {
412                 set_bit(num, enabled_exit_syscalls);
413                 sys_refcount_exit++;
414         }
415         mutex_unlock(&syscall_trace_lock);
416         return ret;
417 }
418
419 void unreg_event_syscall_exit(struct ftrace_event_call *call)
420 {
421         int num;
422
423         num = ((struct syscall_metadata *)call->data)->syscall_nr;
424         if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
425                 return;
426         mutex_lock(&syscall_trace_lock);
427         sys_refcount_exit--;
428         clear_bit(num, enabled_exit_syscalls);
429         if (!sys_refcount_exit)
430                 unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
431         mutex_unlock(&syscall_trace_lock);
432 }
433
434 int init_syscall_trace(struct ftrace_event_call *call)
435 {
436         int id;
437         int num;
438
439         num = ((struct syscall_metadata *)call->data)->syscall_nr;
440         if (num < 0 || num >= NR_syscalls) {
441                 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
442                                 ((struct syscall_metadata *)call->data)->name);
443                 return -ENOSYS;
444         }
445
446         if (set_syscall_print_fmt(call) < 0)
447                 return -ENOMEM;
448
449         id = trace_event_raw_init(call);
450
451         if (id < 0) {
452                 free_syscall_print_fmt(call);
453                 return id;
454         }
455
456         return id;
457 }
458
459 unsigned long __init __weak arch_syscall_addr(int nr)
460 {
461         return (unsigned long)sys_call_table[nr];
462 }
463
464 int __init init_ftrace_syscalls(void)
465 {
466         struct syscall_metadata *meta;
467         unsigned long addr;
468         int i;
469
470         syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
471                                         NR_syscalls, GFP_KERNEL);
472         if (!syscalls_metadata) {
473                 WARN_ON(1);
474                 return -ENOMEM;
475         }
476
477         for (i = 0; i < NR_syscalls; i++) {
478                 addr = arch_syscall_addr(i);
479                 meta = find_syscall_meta(addr);
480                 if (!meta)
481                         continue;
482
483                 meta->syscall_nr = i;
484                 syscalls_metadata[i] = meta;
485         }
486
487         return 0;
488 }
489 core_initcall(init_ftrace_syscalls);
490
491 #ifdef CONFIG_PERF_EVENTS
492
/* Count of currently enabled perf syscall events, one per direction. */
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;

/* One bit per syscall number, set while that syscall's perf event is on. */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
497
498 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
499 {
500         struct syscall_metadata *sys_data;
501         struct syscall_trace_enter *rec;
502         struct hlist_head *head;
503         int syscall_nr;
504         int rctx;
505         int size;
506
507         syscall_nr = syscall_get_nr(current, regs);
508         if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
509                 return;
510
511         sys_data = syscall_nr_to_meta(syscall_nr);
512         if (!sys_data)
513                 return;
514
515         /* get the size after alignment with the u32 buffer size field */
516         size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
517         size = ALIGN(size + sizeof(u32), sizeof(u64));
518         size -= sizeof(u32);
519
520         if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
521                       "perf buffer not large enough"))
522                 return;
523
524         rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
525                                 sys_data->enter_event->event.type, regs, &rctx);
526         if (!rec)
527                 return;
528
529         rec->nr = syscall_nr;
530         syscall_get_arguments(current, regs, 0, sys_data->nb_args,
531                                (unsigned long *)&rec->args);
532
533         head = this_cpu_ptr(sys_data->enter_event->perf_events);
534         perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
535 }
536
537 int perf_sysenter_enable(struct ftrace_event_call *call)
538 {
539         int ret = 0;
540         int num;
541
542         num = ((struct syscall_metadata *)call->data)->syscall_nr;
543
544         mutex_lock(&syscall_trace_lock);
545         if (!sys_perf_refcount_enter)
546                 ret = register_trace_sys_enter(perf_syscall_enter, NULL);
547         if (ret) {
548                 pr_info("event trace: Could not activate"
549                                 "syscall entry trace point");
550         } else {
551                 set_bit(num, enabled_perf_enter_syscalls);
552                 sys_perf_refcount_enter++;
553         }
554         mutex_unlock(&syscall_trace_lock);
555         return ret;
556 }
557
558 void perf_sysenter_disable(struct ftrace_event_call *call)
559 {
560         int num;
561
562         num = ((struct syscall_metadata *)call->data)->syscall_nr;
563
564         mutex_lock(&syscall_trace_lock);
565         sys_perf_refcount_enter--;
566         clear_bit(num, enabled_perf_enter_syscalls);
567         if (!sys_perf_refcount_enter)
568                 unregister_trace_sys_enter(perf_syscall_enter, NULL);
569         mutex_unlock(&syscall_trace_lock);
570 }
571
572 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
573 {
574         struct syscall_metadata *sys_data;
575         struct syscall_trace_exit *rec;
576         struct hlist_head *head;
577         int syscall_nr;
578         int rctx;
579         int size;
580
581         syscall_nr = syscall_get_nr(current, regs);
582         if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
583                 return;
584
585         sys_data = syscall_nr_to_meta(syscall_nr);
586         if (!sys_data)
587                 return;
588
589         /* We can probably do that at build time */
590         size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
591         size -= sizeof(u32);
592
593         /*
594          * Impossible, but be paranoid with the future
595          * How to put this check outside runtime?
596          */
597         if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
598                 "exit event has grown above perf buffer size"))
599                 return;
600
601         rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
602                                 sys_data->exit_event->event.type, regs, &rctx);
603         if (!rec)
604                 return;
605
606         rec->nr = syscall_nr;
607         rec->ret = syscall_get_return_value(current, regs);
608
609         head = this_cpu_ptr(sys_data->exit_event->perf_events);
610         perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
611 }
612
613 int perf_sysexit_enable(struct ftrace_event_call *call)
614 {
615         int ret = 0;
616         int num;
617
618         num = ((struct syscall_metadata *)call->data)->syscall_nr;
619
620         mutex_lock(&syscall_trace_lock);
621         if (!sys_perf_refcount_exit)
622                 ret = register_trace_sys_exit(perf_syscall_exit, NULL);
623         if (ret) {
624                 pr_info("event trace: Could not activate"
625                                 "syscall exit trace point");
626         } else {
627                 set_bit(num, enabled_perf_exit_syscalls);
628                 sys_perf_refcount_exit++;
629         }
630         mutex_unlock(&syscall_trace_lock);
631         return ret;
632 }
633
634 void perf_sysexit_disable(struct ftrace_event_call *call)
635 {
636         int num;
637
638         num = ((struct syscall_metadata *)call->data)->syscall_nr;
639
640         mutex_lock(&syscall_trace_lock);
641         sys_perf_refcount_exit--;
642         clear_bit(num, enabled_perf_exit_syscalls);
643         if (!sys_perf_refcount_exit)
644                 unregister_trace_sys_exit(perf_syscall_exit, NULL);
645         mutex_unlock(&syscall_trace_lock);
646 }
647
648 #endif /* CONFIG_PERF_EVENTS */
649
650 static int syscall_enter_register(struct ftrace_event_call *event,
651                                  enum trace_reg type)
652 {
653         switch (type) {
654         case TRACE_REG_REGISTER:
655                 return reg_event_syscall_enter(event);
656         case TRACE_REG_UNREGISTER:
657                 unreg_event_syscall_enter(event);
658                 return 0;
659
660 #ifdef CONFIG_PERF_EVENTS
661         case TRACE_REG_PERF_REGISTER:
662                 return perf_sysenter_enable(event);
663         case TRACE_REG_PERF_UNREGISTER:
664                 perf_sysenter_disable(event);
665                 return 0;
666 #endif
667         }
668         return 0;
669 }
670
671 static int syscall_exit_register(struct ftrace_event_call *event,
672                                  enum trace_reg type)
673 {
674         switch (type) {
675         case TRACE_REG_REGISTER:
676                 return reg_event_syscall_exit(event);
677         case TRACE_REG_UNREGISTER:
678                 unreg_event_syscall_exit(event);
679                 return 0;
680
681 #ifdef CONFIG_PERF_EVENTS
682         case TRACE_REG_PERF_REGISTER:
683                 return perf_sysexit_enable(event);
684         case TRACE_REG_PERF_UNREGISTER:
685                 perf_sysexit_disable(event);
686                 return 0;
687 #endif
688         }
689         return 0;
690 }