tracing/filter: Swap entire filter of events
[linux-2.6.git] / kernel / trace / trace_syscalls.c
1 #include <trace/syscall.h>
2 #include <trace/events/syscalls.h>
3 #include <linux/slab.h>
4 #include <linux/kernel.h>
5 #include <linux/ftrace.h>
6 #include <linux/perf_event.h>
7 #include <asm/syscall.h>
8
9 #include "trace_output.h"
10 #include "trace.h"
11
/*
 * syscall_trace_lock is held in this file around every update of the
 * refcounts and "enabled" bitmaps of the syscall events.
 */
static DEFINE_MUTEX(syscall_trace_lock);

/* ftrace syscall enter events: user count and one bit per syscall number */
static int sys_refcount_enter;
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);

/* ftrace syscall exit events: user count and one bit per syscall number */
static int sys_refcount_exit;
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17
/*
 * Forward declarations of the callbacks wired into the syscall event
 * classes defined further down.
 */
static int syscall_enter_define_fields(struct ftrace_event_call *call);
static int syscall_exit_define_fields(struct ftrace_event_call *call);

static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type);
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type);
25
26 static struct list_head *
27 syscall_get_enter_fields(struct ftrace_event_call *call)
28 {
29         struct syscall_metadata *entry = call->data;
30
31         return &entry->enter_fields;
32 }
33
34 struct trace_event_functions enter_syscall_print_funcs = {
35         .trace          = print_syscall_enter,
36 };
37
38 struct trace_event_functions exit_syscall_print_funcs = {
39         .trace          = print_syscall_exit,
40 };
41
42 struct ftrace_event_class event_class_syscall_enter = {
43         .system         = "syscalls",
44         .reg            = syscall_enter_register,
45         .define_fields  = syscall_enter_define_fields,
46         .get_fields     = syscall_get_enter_fields,
47         .raw_init       = init_syscall_trace,
48 };
49
50 struct ftrace_event_class event_class_syscall_exit = {
51         .system         = "syscalls",
52         .reg            = syscall_exit_register,
53         .define_fields  = syscall_exit_define_fields,
54         .fields         = LIST_HEAD_INIT(event_class_syscall_exit.fields),
55         .raw_init       = init_syscall_trace,
56 };
57
/*
 * Bounds of the array of syscall metadata pointers that
 * find_syscall_meta() walks. NOTE(review): presumably provided by the
 * linker script -- confirm.
 */
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];

/* Table indexed by syscall number, filled in by init_ftrace_syscalls() */
static struct syscall_metadata **syscalls_metadata;
62
63 static __init struct syscall_metadata *
64 find_syscall_meta(unsigned long syscall)
65 {
66         struct syscall_metadata **start;
67         struct syscall_metadata **stop;
68         char str[KSYM_SYMBOL_LEN];
69
70
71         start = __start_syscalls_metadata;
72         stop = __stop_syscalls_metadata;
73         kallsyms_lookup(syscall, NULL, NULL, NULL, str);
74
75         for ( ; start < stop; start++) {
76                 /*
77                  * Only compare after the "sys" prefix. Archs that use
78                  * syscall wrappers may have syscalls symbols aliases prefixed
79                  * with "SyS" instead of "sys", leading to an unwanted
80                  * mismatch.
81                  */
82                 if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
83                         return *start;
84         }
85         return NULL;
86 }
87
88 static struct syscall_metadata *syscall_nr_to_meta(int nr)
89 {
90         if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
91                 return NULL;
92
93         return syscalls_metadata[nr];
94 }
95
96 enum print_line_t
97 print_syscall_enter(struct trace_iterator *iter, int flags,
98                     struct trace_event *event)
99 {
100         struct trace_seq *s = &iter->seq;
101         struct trace_entry *ent = iter->ent;
102         struct syscall_trace_enter *trace;
103         struct syscall_metadata *entry;
104         int i, ret, syscall;
105
106         trace = (typeof(trace))ent;
107         syscall = trace->nr;
108         entry = syscall_nr_to_meta(syscall);
109
110         if (!entry)
111                 goto end;
112
113         if (entry->enter_event->event.type != ent->type) {
114                 WARN_ON_ONCE(1);
115                 goto end;
116         }
117
118         ret = trace_seq_printf(s, "%s(", entry->name);
119         if (!ret)
120                 return TRACE_TYPE_PARTIAL_LINE;
121
122         for (i = 0; i < entry->nb_args; i++) {
123                 /* parameter types */
124                 if (trace_flags & TRACE_ITER_VERBOSE) {
125                         ret = trace_seq_printf(s, "%s ", entry->types[i]);
126                         if (!ret)
127                                 return TRACE_TYPE_PARTIAL_LINE;
128                 }
129                 /* parameter values */
130                 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
131                                        trace->args[i],
132                                        i == entry->nb_args - 1 ? "" : ", ");
133                 if (!ret)
134                         return TRACE_TYPE_PARTIAL_LINE;
135         }
136
137         ret = trace_seq_putc(s, ')');
138         if (!ret)
139                 return TRACE_TYPE_PARTIAL_LINE;
140
141 end:
142         ret =  trace_seq_putc(s, '\n');
143         if (!ret)
144                 return TRACE_TYPE_PARTIAL_LINE;
145
146         return TRACE_TYPE_HANDLED;
147 }
148
149 enum print_line_t
150 print_syscall_exit(struct trace_iterator *iter, int flags,
151                    struct trace_event *event)
152 {
153         struct trace_seq *s = &iter->seq;
154         struct trace_entry *ent = iter->ent;
155         struct syscall_trace_exit *trace;
156         int syscall;
157         struct syscall_metadata *entry;
158         int ret;
159
160         trace = (typeof(trace))ent;
161         syscall = trace->nr;
162         entry = syscall_nr_to_meta(syscall);
163
164         if (!entry) {
165                 trace_seq_printf(s, "\n");
166                 return TRACE_TYPE_HANDLED;
167         }
168
169         if (entry->exit_event->event.type != ent->type) {
170                 WARN_ON_ONCE(1);
171                 return TRACE_TYPE_UNHANDLED;
172         }
173
174         ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
175                                 trace->ret);
176         if (!ret)
177                 return TRACE_TYPE_PARTIAL_LINE;
178
179         return TRACE_TYPE_HANDLED;
180 }
181
/*
 * Only declared in this file. NOTE(review): presumably never defined, so
 * that a size mismatch caught by SYSCALL_FIELD() fails the build -- confirm.
 */
extern char *__bad_type_size(void);

/*
 * Expands to the (type, name, offset, size, is_signed) argument run for
 * trace_define_field(), describing member @name of a local variable that
 * must be called "trace". When sizeof(@type) differs from the member's
 * size, the type argument becomes a call to __bad_type_size().
 */
#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)
189
190 static
191 int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
192 {
193         int i;
194         int pos = 0;
195
196         /* When len=0, we just calculate the needed length */
197 #define LEN_OR_ZERO (len ? len - pos : 0)
198
199         pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
200         for (i = 0; i < entry->nb_args; i++) {
201                 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
202                                 entry->args[i], sizeof(unsigned long),
203                                 i == entry->nb_args - 1 ? "" : ", ");
204         }
205         pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
206
207         for (i = 0; i < entry->nb_args; i++) {
208                 pos += snprintf(buf + pos, LEN_OR_ZERO,
209                                 ", ((unsigned long)(REC->%s))", entry->args[i]);
210         }
211
212 #undef LEN_OR_ZERO
213
214         /* return the length of print_fmt */
215         return pos;
216 }
217
218 static int set_syscall_print_fmt(struct ftrace_event_call *call)
219 {
220         char *print_fmt;
221         int len;
222         struct syscall_metadata *entry = call->data;
223
224         if (entry->enter_event != call) {
225                 call->print_fmt = "\"0x%lx\", REC->ret";
226                 return 0;
227         }
228
229         /* First: called with 0 length to calculate the needed length */
230         len = __set_enter_print_fmt(entry, NULL, 0);
231
232         print_fmt = kmalloc(len + 1, GFP_KERNEL);
233         if (!print_fmt)
234                 return -ENOMEM;
235
236         /* Second: actually write the @print_fmt */
237         __set_enter_print_fmt(entry, print_fmt, len + 1);
238         call->print_fmt = print_fmt;
239
240         return 0;
241 }
242
243 static void free_syscall_print_fmt(struct ftrace_event_call *call)
244 {
245         struct syscall_metadata *entry = call->data;
246
247         if (entry->enter_event == call)
248                 kfree(call->print_fmt);
249 }
250
251 static int syscall_enter_define_fields(struct ftrace_event_call *call)
252 {
253         struct syscall_trace_enter trace;
254         struct syscall_metadata *meta = call->data;
255         int ret;
256         int i;
257         int offset = offsetof(typeof(trace), args);
258
259         ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
260         if (ret)
261                 return ret;
262
263         for (i = 0; i < meta->nb_args; i++) {
264                 ret = trace_define_field(call, meta->types[i],
265                                          meta->args[i], offset,
266                                          sizeof(unsigned long), 0,
267                                          FILTER_OTHER);
268                 offset += sizeof(unsigned long);
269         }
270
271         return ret;
272 }
273
274 static int syscall_exit_define_fields(struct ftrace_event_call *call)
275 {
276         struct syscall_trace_exit trace;
277         int ret;
278
279         ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
280         if (ret)
281                 return ret;
282
283         ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
284                                  FILTER_OTHER);
285
286         return ret;
287 }
288
289 void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
290 {
291         struct syscall_trace_enter *entry;
292         struct syscall_metadata *sys_data;
293         struct ring_buffer_event *event;
294         struct ring_buffer *buffer;
295         int size;
296         int syscall_nr;
297
298         syscall_nr = syscall_get_nr(current, regs);
299         if (syscall_nr < 0)
300                 return;
301         if (!test_bit(syscall_nr, enabled_enter_syscalls))
302                 return;
303
304         sys_data = syscall_nr_to_meta(syscall_nr);
305         if (!sys_data)
306                 return;
307
308         size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
309
310         event = trace_current_buffer_lock_reserve(&buffer,
311                         sys_data->enter_event->event.type, size, 0, 0);
312         if (!event)
313                 return;
314
315         entry = ring_buffer_event_data(event);
316         entry->nr = syscall_nr;
317         syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
318
319         if (!filter_current_check_discard(buffer, sys_data->enter_event,
320                                           entry, event))
321                 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
322 }
323
324 void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
325 {
326         struct syscall_trace_exit *entry;
327         struct syscall_metadata *sys_data;
328         struct ring_buffer_event *event;
329         struct ring_buffer *buffer;
330         int syscall_nr;
331
332         syscall_nr = syscall_get_nr(current, regs);
333         if (syscall_nr < 0)
334                 return;
335         if (!test_bit(syscall_nr, enabled_exit_syscalls))
336                 return;
337
338         sys_data = syscall_nr_to_meta(syscall_nr);
339         if (!sys_data)
340                 return;
341
342         event = trace_current_buffer_lock_reserve(&buffer,
343                         sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
344         if (!event)
345                 return;
346
347         entry = ring_buffer_event_data(event);
348         entry->nr = syscall_nr;
349         entry->ret = syscall_get_return_value(current, regs);
350
351         if (!filter_current_check_discard(buffer, sys_data->exit_event,
352                                           entry, event))
353                 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
354 }
355
356 int reg_event_syscall_enter(struct ftrace_event_call *call)
357 {
358         int ret = 0;
359         int num;
360
361         num = ((struct syscall_metadata *)call->data)->syscall_nr;
362         if (num < 0 || num >= NR_syscalls)
363                 return -ENOSYS;
364         mutex_lock(&syscall_trace_lock);
365         if (!sys_refcount_enter)
366                 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
367         if (!ret) {
368                 set_bit(num, enabled_enter_syscalls);
369                 sys_refcount_enter++;
370         }
371         mutex_unlock(&syscall_trace_lock);
372         return ret;
373 }
374
375 void unreg_event_syscall_enter(struct ftrace_event_call *call)
376 {
377         int num;
378
379         num = ((struct syscall_metadata *)call->data)->syscall_nr;
380         if (num < 0 || num >= NR_syscalls)
381                 return;
382         mutex_lock(&syscall_trace_lock);
383         sys_refcount_enter--;
384         clear_bit(num, enabled_enter_syscalls);
385         if (!sys_refcount_enter)
386                 unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
387         mutex_unlock(&syscall_trace_lock);
388 }
389
390 int reg_event_syscall_exit(struct ftrace_event_call *call)
391 {
392         int ret = 0;
393         int num;
394
395         num = ((struct syscall_metadata *)call->data)->syscall_nr;
396         if (num < 0 || num >= NR_syscalls)
397                 return -ENOSYS;
398         mutex_lock(&syscall_trace_lock);
399         if (!sys_refcount_exit)
400                 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
401         if (!ret) {
402                 set_bit(num, enabled_exit_syscalls);
403                 sys_refcount_exit++;
404         }
405         mutex_unlock(&syscall_trace_lock);
406         return ret;
407 }
408
409 void unreg_event_syscall_exit(struct ftrace_event_call *call)
410 {
411         int num;
412
413         num = ((struct syscall_metadata *)call->data)->syscall_nr;
414         if (num < 0 || num >= NR_syscalls)
415                 return;
416         mutex_lock(&syscall_trace_lock);
417         sys_refcount_exit--;
418         clear_bit(num, enabled_exit_syscalls);
419         if (!sys_refcount_exit)
420                 unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
421         mutex_unlock(&syscall_trace_lock);
422 }
423
424 int init_syscall_trace(struct ftrace_event_call *call)
425 {
426         int id;
427
428         if (set_syscall_print_fmt(call) < 0)
429                 return -ENOMEM;
430
431         id = trace_event_raw_init(call);
432
433         if (id < 0) {
434                 free_syscall_print_fmt(call);
435                 return id;
436         }
437
438         return id;
439 }
440
441 unsigned long __init arch_syscall_addr(int nr)
442 {
443         return (unsigned long)sys_call_table[nr];
444 }
445
446 int __init init_ftrace_syscalls(void)
447 {
448         struct syscall_metadata *meta;
449         unsigned long addr;
450         int i;
451
452         syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
453                                         NR_syscalls, GFP_KERNEL);
454         if (!syscalls_metadata) {
455                 WARN_ON(1);
456                 return -ENOMEM;
457         }
458
459         for (i = 0; i < NR_syscalls; i++) {
460                 addr = arch_syscall_addr(i);
461                 meta = find_syscall_meta(addr);
462                 if (!meta)
463                         continue;
464
465                 meta->syscall_nr = i;
466                 syscalls_metadata[i] = meta;
467         }
468
469         return 0;
470 }
471 core_initcall(init_ftrace_syscalls);
472
473 #ifdef CONFIG_PERF_EVENTS
474
/* perf syscall enter events: user count and one bit per syscall number */
static int sys_perf_refcount_enter;
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);

/* perf syscall exit events: user count and one bit per syscall number */
static int sys_perf_refcount_exit;
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
479
480 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
481 {
482         struct syscall_metadata *sys_data;
483         struct syscall_trace_enter *rec;
484         struct hlist_head *head;
485         int syscall_nr;
486         int rctx;
487         int size;
488
489         syscall_nr = syscall_get_nr(current, regs);
490         if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
491                 return;
492
493         sys_data = syscall_nr_to_meta(syscall_nr);
494         if (!sys_data)
495                 return;
496
497         /* get the size after alignment with the u32 buffer size field */
498         size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
499         size = ALIGN(size + sizeof(u32), sizeof(u64));
500         size -= sizeof(u32);
501
502         if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
503                       "perf buffer not large enough"))
504                 return;
505
506         rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
507                                 sys_data->enter_event->event.type, regs, &rctx);
508         if (!rec)
509                 return;
510
511         rec->nr = syscall_nr;
512         syscall_get_arguments(current, regs, 0, sys_data->nb_args,
513                                (unsigned long *)&rec->args);
514
515         head = this_cpu_ptr(sys_data->enter_event->perf_events);
516         perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
517 }
518
519 int perf_sysenter_enable(struct ftrace_event_call *call)
520 {
521         int ret = 0;
522         int num;
523
524         num = ((struct syscall_metadata *)call->data)->syscall_nr;
525
526         mutex_lock(&syscall_trace_lock);
527         if (!sys_perf_refcount_enter)
528                 ret = register_trace_sys_enter(perf_syscall_enter, NULL);
529         if (ret) {
530                 pr_info("event trace: Could not activate"
531                                 "syscall entry trace point");
532         } else {
533                 set_bit(num, enabled_perf_enter_syscalls);
534                 sys_perf_refcount_enter++;
535         }
536         mutex_unlock(&syscall_trace_lock);
537         return ret;
538 }
539
540 void perf_sysenter_disable(struct ftrace_event_call *call)
541 {
542         int num;
543
544         num = ((struct syscall_metadata *)call->data)->syscall_nr;
545
546         mutex_lock(&syscall_trace_lock);
547         sys_perf_refcount_enter--;
548         clear_bit(num, enabled_perf_enter_syscalls);
549         if (!sys_perf_refcount_enter)
550                 unregister_trace_sys_enter(perf_syscall_enter, NULL);
551         mutex_unlock(&syscall_trace_lock);
552 }
553
554 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
555 {
556         struct syscall_metadata *sys_data;
557         struct syscall_trace_exit *rec;
558         struct hlist_head *head;
559         int syscall_nr;
560         int rctx;
561         int size;
562
563         syscall_nr = syscall_get_nr(current, regs);
564         if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
565                 return;
566
567         sys_data = syscall_nr_to_meta(syscall_nr);
568         if (!sys_data)
569                 return;
570
571         /* We can probably do that at build time */
572         size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
573         size -= sizeof(u32);
574
575         /*
576          * Impossible, but be paranoid with the future
577          * How to put this check outside runtime?
578          */
579         if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
580                 "exit event has grown above perf buffer size"))
581                 return;
582
583         rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
584                                 sys_data->exit_event->event.type, regs, &rctx);
585         if (!rec)
586                 return;
587
588         rec->nr = syscall_nr;
589         rec->ret = syscall_get_return_value(current, regs);
590
591         head = this_cpu_ptr(sys_data->exit_event->perf_events);
592         perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
593 }
594
595 int perf_sysexit_enable(struct ftrace_event_call *call)
596 {
597         int ret = 0;
598         int num;
599
600         num = ((struct syscall_metadata *)call->data)->syscall_nr;
601
602         mutex_lock(&syscall_trace_lock);
603         if (!sys_perf_refcount_exit)
604                 ret = register_trace_sys_exit(perf_syscall_exit, NULL);
605         if (ret) {
606                 pr_info("event trace: Could not activate"
607                                 "syscall exit trace point");
608         } else {
609                 set_bit(num, enabled_perf_exit_syscalls);
610                 sys_perf_refcount_exit++;
611         }
612         mutex_unlock(&syscall_trace_lock);
613         return ret;
614 }
615
616 void perf_sysexit_disable(struct ftrace_event_call *call)
617 {
618         int num;
619
620         num = ((struct syscall_metadata *)call->data)->syscall_nr;
621
622         mutex_lock(&syscall_trace_lock);
623         sys_perf_refcount_exit--;
624         clear_bit(num, enabled_perf_exit_syscalls);
625         if (!sys_perf_refcount_exit)
626                 unregister_trace_sys_exit(perf_syscall_exit, NULL);
627         mutex_unlock(&syscall_trace_lock);
628 }
629
630 #endif /* CONFIG_PERF_EVENTS */
631
632 static int syscall_enter_register(struct ftrace_event_call *event,
633                                  enum trace_reg type)
634 {
635         switch (type) {
636         case TRACE_REG_REGISTER:
637                 return reg_event_syscall_enter(event);
638         case TRACE_REG_UNREGISTER:
639                 unreg_event_syscall_enter(event);
640                 return 0;
641
642 #ifdef CONFIG_PERF_EVENTS
643         case TRACE_REG_PERF_REGISTER:
644                 return perf_sysenter_enable(event);
645         case TRACE_REG_PERF_UNREGISTER:
646                 perf_sysenter_disable(event);
647                 return 0;
648 #endif
649         }
650         return 0;
651 }
652
653 static int syscall_exit_register(struct ftrace_event_call *event,
654                                  enum trace_reg type)
655 {
656         switch (type) {
657         case TRACE_REG_REGISTER:
658                 return reg_event_syscall_exit(event);
659         case TRACE_REG_UNREGISTER:
660                 unreg_event_syscall_exit(event);
661                 return 0;
662
663 #ifdef CONFIG_PERF_EVENTS
664         case TRACE_REG_PERF_REGISTER:
665                 return perf_sysexit_enable(event);
666         case TRACE_REG_PERF_UNREGISTER:
667                 perf_sysexit_disable(event);
668                 return 0;
669 #endif
670         }
671         return 0;
672 }