Merge branch 'sched/core' into cpus4096
Ingo Molnar [Fri, 12 Dec 2008 12:48:57 +0000 (13:48 +0100)]
Conflicts:
include/linux/ftrace.h
kernel/sched.c

146 files changed:
Documentation/ftrace.txt
Documentation/kernel-parameters.txt
Documentation/lockstat.txt
Documentation/markers.txt
Documentation/tracepoints.txt
arch/ia64/include/asm/topology.h
arch/mips/include/asm/mach-ip27/topology.h
arch/powerpc/include/asm/ftrace.h
arch/powerpc/include/asm/module.h
arch/powerpc/include/asm/topology.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/entry_32.S
arch/powerpc/kernel/entry_64.S
arch/powerpc/kernel/ftrace.c
arch/powerpc/kernel/idle.c
arch/powerpc/kernel/module_32.c
arch/powerpc/kernel/module_64.c
arch/powerpc/lib/Makefile
arch/sh/include/asm/topology.h
arch/um/include/asm/system.h
arch/x86/Kconfig
arch/x86/Kconfig.cpu
arch/x86/Kconfig.debug
arch/x86/include/asm/ds.h
arch/x86/include/asm/ftrace.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/uaccess.h
arch/x86/include/asm/uaccess_32.h
arch/x86/include/asm/uaccess_64.h
arch/x86/kernel/Makefile
arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/ds.c
arch/x86/kernel/dumpstack.c [new file with mode: 0644]
arch/x86/kernel/dumpstack.h [new file with mode: 0644]
arch/x86/kernel/dumpstack_32.c
arch/x86/kernel/dumpstack_64.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/ftrace.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/stacktrace.c
arch/x86/kernel/vsyscall_64.c
arch/x86/lib/usercopy_32.c
arch/x86/lib/usercopy_64.c
arch/x86/mm/Makefile
arch/x86/mm/fault.c
arch/x86/vdso/vclock_gettime.c
block/Kconfig
block/blk-core.c
block/blktrace.c
block/elevator.c
drivers/char/sysrq.c
drivers/md/dm.c
fs/bio.c
fs/seq_file.c
include/asm-generic/vmlinux.lds.h
include/linux/blktrace_api.h
include/linux/compiler.h
include/linux/debug_locks.h
include/linux/ftrace.h
include/linux/ftrace_irq.h [new file with mode: 0644]
include/linux/futex.h
include/linux/hardirq.h
include/linux/kernel.h
include/linux/lockdep.h
include/linux/marker.h
include/linux/mutex.h
include/linux/pid.h
include/linux/rcuclassic.h
include/linux/rcupdate.h
include/linux/ring_buffer.h
include/linux/sched.h
include/linux/seq_file.h
include/linux/stacktrace.h
include/linux/tracepoint.h
include/linux/tty.h
include/linux/uaccess.h
include/trace/block.h [new file with mode: 0644]
include/trace/boot.h [new file with mode: 0644]
include/trace/sched.h
init/Kconfig
init/main.c
kernel/exit.c
kernel/extable.c
kernel/fork.c
kernel/futex.c
kernel/kthread.c
kernel/lockdep.c
kernel/lockdep_proc.c
kernel/marker.c
kernel/module.c
kernel/mutex.c
kernel/notifier.c
kernel/posix-cpu-timers.c
kernel/power/disk.c
kernel/power/main.c
kernel/profile.c
kernel/rcuclassic.c
kernel/sched.c
kernel/sched_cpupri.c
kernel/sched_cpupri.h
kernel/sched_fair.c
kernel/sched_rt.c
kernel/sched_stats.h
kernel/signal.c
kernel/softlockup.c
kernel/sys.c
kernel/sysctl.c
kernel/time/tick-sched.c
kernel/trace/Kconfig
kernel/trace/Makefile
kernel/trace/ftrace.c
kernel/trace/ring_buffer.c
kernel/trace/trace.c
kernel/trace/trace.h
kernel/trace/trace_boot.c
kernel/trace/trace_branch.c [new file with mode: 0644]
kernel/trace/trace_bts.c [new file with mode: 0644]
kernel/trace/trace_functions.c
kernel/trace/trace_functions_graph.c [new file with mode: 0644]
kernel/trace/trace_irqsoff.c
kernel/trace/trace_mmiotrace.c
kernel/trace/trace_nop.c
kernel/trace/trace_power.c [new file with mode: 0644]
kernel/trace/trace_sched_switch.c
kernel/trace/trace_sched_wakeup.c
kernel/trace/trace_selftest.c
kernel/trace/trace_stack.c
kernel/trace/trace_sysprof.c
kernel/tracepoint.c
lib/Kconfig.debug
mm/bounce.c
mm/memory.c
samples/tracepoints/tp-samples-trace.h
samples/tracepoints/tracepoint-probe-sample.c
samples/tracepoints/tracepoint-probe-sample2.c
samples/tracepoints/tracepoint-sample.c
scripts/Makefile.build
scripts/bootgraph.pl
scripts/recordmcount.pl
scripts/trace/power.pl [new file with mode: 0644]
scripts/tracing/draw_functrace.py [new file with mode: 0644]

index 9cc4d68..803b131 100644 (file)
@@ -82,7 +82,7 @@ of ftrace. Here is a list of some of the key files:
                tracer is not adding more data, they will display
                the same information every time they are read.
 
-  iter_ctrl: This file lets the user control the amount of data
+  trace_options: This file lets the user control the amount of data
                that is displayed in one of the above output
                files.
 
@@ -94,10 +94,10 @@ of ftrace. Here is a list of some of the key files:
                only be recorded if the latency is greater than
                the value in this file. (in microseconds)
 
-  trace_entries: This sets or displays the number of bytes each CPU
+  buffer_size_kb: This sets or displays the number of kilobytes each CPU
                buffer can hold. The tracer buffers are the same size
                for each CPU. The displayed number is the size of the
-                CPU buffer and not total size of all buffers. The
+               CPU buffer and not total size of all buffers. The
                trace buffers are allocated in pages (blocks of memory
                that the kernel uses for allocation, usually 4 KB in size).
                If the last page allocated has room for more bytes
@@ -127,6 +127,8 @@ of ftrace. Here is a list of some of the key files:
                be traced. If a function exists in both set_ftrace_filter
                and set_ftrace_notrace, the function will _not_ be traced.
 
+  set_ftrace_pid: Have the function tracer only trace a single thread.
+
   available_filter_functions: This lists the functions that ftrace
                has processed and can trace. These are the function
                names that you can pass to "set_ftrace_filter" or
@@ -316,23 +318,23 @@ The above is mostly meaningful for kernel developers.
   The rest is the same as the 'trace' file.
 
 
-iter_ctrl
----------
+trace_options
+-------------
 
-The iter_ctrl file is used to control what gets printed in the trace
+The trace_options file is used to control what gets printed in the trace
 output. To see what is available, simply cat the file:
 
-  cat /debug/tracing/iter_ctrl
+  cat /debug/tracing/trace_options
   print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
- noblock nostacktrace nosched-tree
+ noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj
 
 To disable one of the options, echo in the option prepended with "no".
 
-  echo noprint-parent > /debug/tracing/iter_ctrl
+  echo noprint-parent > /debug/tracing/trace_options
 
 To enable an option, leave off the "no".
 
-  echo sym-offset > /debug/tracing/iter_ctrl
+  echo sym-offset > /debug/tracing/trace_options
 
 Here are the available options:
 
@@ -378,6 +380,20 @@ Here are the available options:
                When a trace is recorded, so is the stack of functions.
                This allows for back traces of trace sites.
 
+  userstacktrace - This option changes the trace output: each entry also
+                  records a stacktrace of the current userspace thread.
+
+  sym-userobj - when user stacktraces are enabled, look up which object the
+               address belongs to, and print a relative address.
+               This is especially useful when ASLR is on, otherwise you don't
+               get a chance to resolve the address to object/file/line after
+               the app is no longer running.
+
+               The lookup is performed when you read trace, trace_pipe or latency_trace. Example:
+
+               a.out-1623  [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0x494]
+               <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
+
   sched-tree - TBD (any users??)
 
 
@@ -1059,6 +1075,83 @@ For simple one-time traces, the above is sufficient. For anything else,
 a search through /proc/mounts may be needed to find where the debugfs
 file-system is mounted.
 
+
+Single thread tracing
+---------------------
+
+By writing into /debug/tracing/set_ftrace_pid you can trace a
+single thread. For example:
+
+# cat /debug/tracing/set_ftrace_pid
+no pid
+# echo 3111 > /debug/tracing/set_ftrace_pid
+# cat /debug/tracing/set_ftrace_pid
+3111
+# echo function > /debug/tracing/current_tracer
+# cat /debug/tracing/trace | head
+ # tracer: function
+ #
+ #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
+ #              | |       |          |         |
+     yum-updatesd-3111  [003]  1637.254676: finish_task_switch <-thread_return
+     yum-updatesd-3111  [003]  1637.254681: hrtimer_cancel <-schedule_hrtimeout_range
+     yum-updatesd-3111  [003]  1637.254682: hrtimer_try_to_cancel <-hrtimer_cancel
+     yum-updatesd-3111  [003]  1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel
+     yum-updatesd-3111  [003]  1637.254685: fget_light <-do_sys_poll
+     yum-updatesd-3111  [003]  1637.254686: pipe_poll <-do_sys_poll
+# echo -1 > /debug/tracing/set_ftrace_pid
+# cat /debug/tracing/trace |head
+ # tracer: function
+ #
+ #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
+ #              | |       |          |         |
+ ##### CPU 3 buffer started ####
+     yum-updatesd-3111  [003]  1701.957688: free_poll_entry <-poll_freewait
+     yum-updatesd-3111  [003]  1701.957689: remove_wait_queue <-free_poll_entry
+     yum-updatesd-3111  [003]  1701.957691: fput <-free_poll_entry
+     yum-updatesd-3111  [003]  1701.957692: audit_syscall_exit <-sysret_audit
+     yum-updatesd-3111  [003]  1701.957693: path_put <-audit_syscall_exit
+
+If you only want to trace while a particular program is executing, you
+could use something like this simple program:
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+int main (int argc, char **argv)
+{
+        if (argc < 2)
+                exit(-1);
+
+        if (fork() > 0) {
+                int fd, ffd;
+                char line[64];
+                int s;
+
+                ffd = open("/debug/tracing/current_tracer", O_WRONLY);
+                if (ffd < 0)
+                        exit(-1);
+                write(ffd, "nop", 3);
+
+                fd = open("/debug/tracing/set_ftrace_pid", O_WRONLY);
+                s = sprintf(line, "%d\n", getpid());
+                write(fd, line, s);
+
+                write(ffd, "function", 8);
+
+                close(fd);
+                close(ffd);
+
+                execvp(argv[1], argv+1);
+        }
+
+        return 0;
+}
+
 dynamic ftrace
 --------------
 
@@ -1158,7 +1251,11 @@ These are the only wild cards which are supported.
 
   <match>*<match> will not work.
 
- # echo hrtimer_* > /debug/tracing/set_ftrace_filter
+Note: It is better to use quotes to enclose the wild cards, otherwise
+  the shell may expand the parameters into names of files in the local
+  directory.
+
+ # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter
 
 Produces:
 
@@ -1213,7 +1310,7 @@ Again, now we want to append.
  # echo sys_nanosleep > /debug/tracing/set_ftrace_filter
  # cat /debug/tracing/set_ftrace_filter
 sys_nanosleep
- # echo hrtimer_* >> /debug/tracing/set_ftrace_filter
+ # echo 'hrtimer_*' >> /debug/tracing/set_ftrace_filter
  # cat /debug/tracing/set_ftrace_filter
 hrtimer_run_queues
 hrtimer_run_pending
@@ -1299,41 +1396,29 @@ trace entries
 -------------
 
 Having too much or not enough data can be troublesome in diagnosing
-an issue in the kernel. The file trace_entries is used to modify
+an issue in the kernel. The file buffer_size_kb is used to modify
 the size of the internal trace buffers. The number listed
 is the number of entries that can be recorded per CPU. To know
 the full size, multiply the number of possible CPUS with the
 number of entries.
 
- # cat /debug/tracing/trace_entries
-65620
+ # cat /debug/tracing/buffer_size_kb
+1408 (units kilobytes)
 
 Note, to modify this, you must have tracing completely disabled. To do that,
 echo "nop" into the current_tracer. If the current_tracer is not set
 to "nop", an EINVAL error will be returned.
 
  # echo nop > /debug/tracing/current_tracer
- # echo 100000 > /debug/tracing/trace_entries
- # cat /debug/tracing/trace_entries
-100045
-
-
-Notice that we echoed in 100,000 but the size is 100,045. The entries
-are held in individual pages. It allocates the number of pages it takes
-to fulfill the request. If more entries may fit on the last page
-then they will be added.
-
- # echo 1 > /debug/tracing/trace_entries
- # cat /debug/tracing/trace_entries
-85
-
-This shows us that 85 entries can fit in a single page.
+ # echo 10000 > /debug/tracing/buffer_size_kb
+ # cat /debug/tracing/buffer_size_kb
+10000 (units kilobytes)
 
 The number of pages which will be allocated is limited to a percentage
 of available memory. Allocating too much will produce an error.
 
- # echo 1000000000000 > /debug/tracing/trace_entries
+ # echo 1000000000000 > /debug/tracing/buffer_size_kb
 -bash: echo: write error: Cannot allocate memory
- # cat /debug/tracing/trace_entries
+ # cat /debug/tracing/buffer_size_kb
 85
 
index e0f346d..2919a2e 100644 (file)
@@ -750,6 +750,14 @@ and is between 256 and 4096 characters. It is defined in the file
                        parameter will force ia64_sal_cache_flush to call
                        ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 
+       ftrace=[tracer]
+                       [ftrace] will set and start the specified tracer
+                       as early as possible in order to facilitate early
+                       boot debugging.
+
+       ftrace_dump_on_oops
+                       [ftrace] will dump the trace buffers on oops.
+
        gamecon.map[2|3]=
                        [HW,JOY] Multisystem joystick and NES/SNES/PSX pad
                        support via parallel port (up to 5 devices per port)
index 4ba4664..9cb9138 100644 (file)
@@ -71,35 +71,50 @@ Look at the current lock statistics:
 
 # less /proc/lock_stat
 
-01 lock_stat version 0.2
+01 lock_stat version 0.3
 02 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 03                               class name    con-bounces    contentions   waittime-min   waittime-max waittime-total    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total
 04 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 05
-06               &inode->i_data.tree_lock-W:            15          21657           0.18     1093295.30 11547131054.85             58          10415           0.16          87.51        6387.60
-07               &inode->i_data.tree_lock-R:             0              0           0.00           0.00           0.00          23302         231198           0.25           8.45       98023.38
-08               --------------------------
-09                 &inode->i_data.tree_lock              0          [<ffffffff8027c08f>] add_to_page_cache+0x5f/0x190
-10
-11 ...............................................................................................................................................................................................
-12
-13                              dcache_lock:          1037           1161           0.38          45.32         774.51           6611         243371           0.15         306.48       77387.24
-14                              -----------
-15                              dcache_lock            180          [<ffffffff802c0d7e>] sys_getcwd+0x11e/0x230
-16                              dcache_lock            165          [<ffffffff802c002a>] d_alloc+0x15a/0x210
-17                              dcache_lock             33          [<ffffffff8035818d>] _atomic_dec_and_lock+0x4d/0x70
-18                              dcache_lock              1          [<ffffffff802beef8>] shrink_dcache_parent+0x18/0x130
+06                          &mm->mmap_sem-W:           233            538 18446744073708       22924.27      607243.51           1342          45806           1.71        8595.89     1180582.34
+07                          &mm->mmap_sem-R:           205            587 18446744073708       28403.36      731975.00           1940         412426           0.58      187825.45     6307502.88
+08                          ---------------
+09                            &mm->mmap_sem            487          [<ffffffff8053491f>] do_page_fault+0x466/0x928
+10                            &mm->mmap_sem            179          [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
+11                            &mm->mmap_sem            279          [<ffffffff80210a57>] sys_mmap+0x75/0xce
+12                            &mm->mmap_sem             76          [<ffffffff802a490b>] sys_munmap+0x32/0x59
+13                          ---------------
+14                            &mm->mmap_sem            270          [<ffffffff80210a57>] sys_mmap+0x75/0xce
+15                            &mm->mmap_sem            431          [<ffffffff8053491f>] do_page_fault+0x466/0x928
+16                            &mm->mmap_sem            138          [<ffffffff802a490b>] sys_munmap+0x32/0x59
+17                            &mm->mmap_sem            145          [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
+18
+19 ...............................................................................................................................................................................................
+20
+21                              dcache_lock:           621            623           0.52         118.26        1053.02           6745          91930           0.29         316.29      118423.41
+22                              -----------
+23                              dcache_lock            179          [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
+24                              dcache_lock            113          [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
+25                              dcache_lock             99          [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
+26                              dcache_lock            104          [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
+27                              -----------
+28                              dcache_lock            192          [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
+29                              dcache_lock             98          [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
+30                              dcache_lock             72          [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
+31                              dcache_lock            112          [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
 
 This excerpt shows the first two lock class statistics. Line 01 shows the
 output version - each time the format changes this will be updated. Line 02-04
-show the header with column descriptions. Lines 05-10 and 13-18 show the actual
+show the header with column descriptions. Lines 05-18 and 20-31 show the actual
 statistics. These statistics come in two parts; the actual stats separated by a
-short separator (line 08, 14) from the contention points.
+short separator (line 08, 13) from the contention points.
 
-The first lock (05-10) is a read/write lock, and shows two lines above the
+The first lock (05-18) is a read/write lock, and shows two lines above the
 short separator. The contention points don't match the column descriptors,
-they have two: contentions and [<IP>] symbol.
+they have two: contentions and [<IP>] symbol. The second set of contention
+points are the points we're contending with.
 
+The integer part of the time values is in microseconds.
 
 View the top contending locks:
 
index 089f613..d2b3d0e 100644 (file)
@@ -51,11 +51,16 @@ to call) for the specific marker through marker_probe_register() and can be
 activated by calling marker_arm(). Marker deactivation can be done by calling
 marker_disarm() as many times as marker_arm() has been called. Removing a probe
 is done through marker_probe_unregister(); it will disarm the probe.
-marker_synchronize_unregister() must be called before the end of the module exit
-function to make sure there is no caller left using the probe. This, and the
-fact that preemption is disabled around the probe call, make sure that probe
-removal and module unload are safe. See the "Probe example" section below for a
-sample probe module.
+
+marker_synchronize_unregister() must be called between probe unregistration and
+the first occurrence of
+- the end of module exit function,
+  to make sure there is no caller left using the probe;
+- the freeing of any resource used by the probes,
+  to make sure the probes won't be accessing invalid data.
+This, and the fact that preemption is disabled around the probe call, make sure
+that probe removal and module unload are safe. See the "Probe example" section
+below for a sample probe module.
 
 The marker mechanism supports inserting multiple instances of the same marker.
 Markers can be put in inline functions, inlined static functions, and
@@ -70,6 +75,20 @@ a printk warning which identifies the inconsistency:
 
 "Format mismatch for probe probe_name (format), marker (format)"
 
+Another way to use markers is to simply define the marker without generating any
+function call to actually call into the marker. This is useful in combination
+with tracepoint probes in a scheme like this :
+
+void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk);
+
+DEFINE_MARKER_TP(marker_eventname, tracepoint_name, probe_tracepoint_name,
+       "arg1 %u pid %d");
+
+notrace void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk)
+{
+       struct marker *marker = &GET_MARKER(marker_eventname);
+       /* write data to trace buffers ... */
+}
 
 * Probe / marker example
 
index 5d354e1..6f0a044 100644 (file)
@@ -3,28 +3,30 @@
                            Mathieu Desnoyers
 
 
-This document introduces Linux Kernel Tracepoints and their use. It provides
-examples of how to insert tracepoints in the kernel and connect probe functions
-to them and provides some examples of probe functions.
+This document introduces Linux Kernel Tracepoints and their use. It
+provides examples of how to insert tracepoints in the kernel and
+connect probe functions to them and provides some examples of probe
+functions.
 
 
 * Purpose of tracepoints
 
-A tracepoint placed in code provides a hook to call a function (probe) that you
-can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or
-"off" (no probe is attached). When a tracepoint is "off" it has no effect,
-except for adding a tiny time penalty (checking a condition for a branch) and
-space penalty (adding a few bytes for the function call at the end of the
-instrumented function and adds a data structure in a separate section).  When a
-tracepoint is "on", the function you provide is called each time the tracepoint
-is executed, in the execution context of the caller. When the function provided
-ends its execution, it returns to the caller (continuing from the tracepoint
-site).
+A tracepoint placed in code provides a hook to call a function (probe)
+that you can provide at runtime. A tracepoint can be "on" (a probe is
+connected to it) or "off" (no probe is attached). When a tracepoint is
+"off" it has no effect, except for adding a tiny time penalty
+(checking a condition for a branch) and space penalty (adding a few
+bytes for the function call at the end of the instrumented function
+and adds a data structure in a separate section).  When a tracepoint
+is "on", the function you provide is called each time the tracepoint
+is executed, in the execution context of the caller. When the function
+provided ends its execution, it returns to the caller (continuing from
+the tracepoint site).
 
 You can put tracepoints at important locations in the code. They are
 lightweight hooks that can pass an arbitrary number of parameters,
-which prototypes are described in a tracepoint declaration placed in a header
-file.
+which prototypes are described in a tracepoint declaration placed in a
+header file.
 
 They can be used for tracing and performance accounting.
 
@@ -42,14 +44,16 @@ In include/trace/subsys.h :
 
 #include <linux/tracepoint.h>
 
-DEFINE_TRACE(subsys_eventname,
-       TPPTOTO(int firstarg, struct task_struct *p),
+DECLARE_TRACE(subsys_eventname,
+       TPPROTO(int firstarg, struct task_struct *p),
        TPARGS(firstarg, p));
 
 In subsys/file.c (where the tracing statement must be added) :
 
 #include <trace/subsys.h>
 
+DEFINE_TRACE(subsys_eventname);
+
 void somefct(void)
 {
        ...
@@ -61,31 +65,41 @@ Where :
 - subsys_eventname is an identifier unique to your event
     - subsys is the name of your subsystem.
     - eventname is the name of the event to trace.
-- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the function
-  called by this tracepoint.
-- TPARGS(firstarg, p) are the parameters names, same as found in the prototype.
 
-Connecting a function (probe) to a tracepoint is done by providing a probe
-(function to call) for the specific tracepoint through
-register_trace_subsys_eventname().  Removing a probe is done through
-unregister_trace_subsys_eventname(); it will remove the probe sure there is no
-caller left using the probe when it returns. Probe removal is preempt-safe
-because preemption is disabled around the probe call. See the "Probe example"
-section below for a sample probe module.
-
-The tracepoint mechanism supports inserting multiple instances of the same
-tracepoint, but a single definition must be made of a given tracepoint name over
-all the kernel to make sure no type conflict will occur. Name mangling of the
-tracepoints is done using the prototypes to make sure typing is correct.
-Verification of probe type correctness is done at the registration site by the
-compiler. Tracepoints can be put in inline functions, inlined static functions,
-and unrolled loops as well as regular functions.
-
-The naming scheme "subsys_event" is suggested here as a convention intended
-to limit collisions. Tracepoint names are global to the kernel: they are
-considered as being the same whether they are in the core kernel image or in
-modules.
+- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the
+  function called by this tracepoint.
 
+- TPARGS(firstarg, p) are the parameter names, same as found in the
+  prototype.
+
+Connecting a function (probe) to a tracepoint is done by providing a
+probe (function to call) for the specific tracepoint through
+register_trace_subsys_eventname().  Removing a probe is done through
+unregister_trace_subsys_eventname(); it will remove the probe.
+
+tracepoint_synchronize_unregister() must be called before the end of
+the module exit function to make sure there is no caller left using
+the probe. This, and the fact that preemption is disabled around the
+probe call, make sure that probe removal and module unload are safe.
+See the "Probe example" section below for a sample probe module.
+
+The tracepoint mechanism supports inserting multiple instances of the
+same tracepoint, but a single definition must be made of a given
+tracepoint name over all the kernel to make sure no type conflict will
+occur. Name mangling of the tracepoints is done using the prototypes
+to make sure typing is correct. Verification of probe type correctness
+is done at the registration site by the compiler. Tracepoints can be
+put in inline functions, inlined static functions, and unrolled loops
+as well as regular functions.
+
+The naming scheme "subsys_event" is suggested here as a convention
+intended to limit collisions. Tracepoint names are global to the
+kernel: they are considered as being the same whether they are in the
+core kernel image or in modules.
+
+If the tracepoint has to be used in kernel modules, an
+EXPORT_TRACEPOINT_SYMBOL_GPL() or EXPORT_TRACEPOINT_SYMBOL() can be
+used to export the defined tracepoints.
 
 * Probe / tracepoint example
 
index 35bcb64..a3cc9f6 100644 (file)
@@ -55,7 +55,6 @@
 void build_cpu_to_node_map(void);
 
 #define SD_CPU_INIT (struct sched_domain) {            \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
@@ -80,7 +79,6 @@ void build_cpu_to_node_map(void);
 
 /* sched_domains SD_NODE_INIT for IA64 NUMA machines */
 #define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
index 7785bec..1fb959f 100644 (file)
@@ -37,7 +37,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
 
 /* sched_domains SD_NODE_INIT for SGI IP27 machines */
 #define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
index b298f7a..e5f2ae8 100644 (file)
@@ -7,7 +7,19 @@
 
 #ifndef __ASSEMBLY__
 extern void _mcount(void);
-#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+       /* relocation of mcount call site is the same as the address */
+       return addr;
+}
+
+struct dyn_arch_ftrace {
+       struct module *mod;
+};
+#endif /*  CONFIG_DYNAMIC_FTRACE */
+#endif /* __ASSEMBLY__ */
 
 #endif
 
index e5f14b1..0845488 100644 (file)
@@ -34,11 +34,19 @@ struct mod_arch_specific {
 #ifdef __powerpc64__
        unsigned int stubs_section;     /* Index of stubs section in module */
        unsigned int toc_section;       /* What section is the TOC? */
-#else
+#ifdef CONFIG_DYNAMIC_FTRACE
+       unsigned long toc;
+       unsigned long tramp;
+#endif
+
+#else /* powerpc64 */
        /* Indices of PLT sections within module. */
        unsigned int core_plt_section;
        unsigned int init_plt_section;
+#ifdef CONFIG_DYNAMIC_FTRACE
+       unsigned long tramp;
 #endif
+#endif /* powerpc64 */
 
        /* List of BUG addresses, source line numbers and filenames */
        struct list_head bug_list;
@@ -68,6 +76,12 @@ struct mod_arch_specific {
 #    endif     /* MODULE */
 #endif
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+#    ifdef MODULE
+       asm(".section .ftrace.tramp,\"ax\",@nobits; .align 3; .previous");
+#    endif     /* MODULE */
+#endif
+
 
 struct exception_table_entry;
 void sort_ex_table(struct exception_table_entry *start,
index c32da6f..373fca3 100644 (file)
@@ -48,7 +48,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 
 /* sched_domains SD_NODE_INIT for PPC64 machines */
 #define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
index 92673b4..d17edb4 100644 (file)
@@ -17,6 +17,7 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog
 CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog
 CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog
+CFLAGS_REMOVE_prom.o = -pg -mno-sched-epilog
 
 ifdef CONFIG_DYNAMIC_FTRACE
 # dynamic ftrace setup.
index 7ecc0d1..6f7eb7e 100644 (file)
@@ -1162,39 +1162,17 @@ machine_check_in_rtas:
 #ifdef CONFIG_DYNAMIC_FTRACE
 _GLOBAL(mcount)
 _GLOBAL(_mcount)
-       stwu    r1,-48(r1)
-       stw     r3, 12(r1)
-       stw     r4, 16(r1)
-       stw     r5, 20(r1)
-       stw     r6, 24(r1)
-       mflr    r3
-       stw     r7, 28(r1)
-       mfcr    r5
-       stw     r8, 32(r1)
-       stw     r9, 36(r1)
-       stw     r10,40(r1)
-       stw     r3, 44(r1)
-       stw     r5, 8(r1)
-       subi    r3, r3, MCOUNT_INSN_SIZE
-       .globl mcount_call
-mcount_call:
-       bl      ftrace_stub
-       nop
-       lwz     r6, 8(r1)
-       lwz     r0, 44(r1)
-       lwz     r3, 12(r1)
+       /*
+        * It is required that _mcount on PPC32 must preserve the
+        * link register. But we have r0 to play with. We use r0
+        * to move the return address (back to the caller of mcount)
+        * into the ctr register, restore the link register and
+        * then jump back using the ctr register.
+        */
+       mflr    r0
        mtctr   r0
-       lwz     r4, 16(r1)
-       mtcr    r6
-       lwz     r5, 20(r1)
-       lwz     r6, 24(r1)
-       lwz     r0, 52(r1)
-       lwz     r7, 28(r1)
-       lwz     r8, 32(r1)
+       lwz     r0, 4(r1)
        mtlr    r0
-       lwz     r9, 36(r1)
-       lwz     r10,40(r1)
-       addi    r1, r1, 48
        bctr
 
 _GLOBAL(ftrace_caller)
index e0bcf93..383ed6e 100644 (file)
@@ -894,18 +894,6 @@ _GLOBAL(enter_prom)
 #ifdef CONFIG_DYNAMIC_FTRACE
 _GLOBAL(mcount)
 _GLOBAL(_mcount)
-       /* Taken from output of objdump from lib64/glibc */
-       mflr    r3
-       stdu    r1, -112(r1)
-       std     r3, 128(r1)
-       subi    r3, r3, MCOUNT_INSN_SIZE
-       .globl mcount_call
-mcount_call:
-       bl      ftrace_stub
-       nop
-       ld      r0, 128(r1)
-       mtlr    r0
-       addi    r1, r1, 112
        blr
 
 _GLOBAL(ftrace_caller)
index f4b006e..5355244 100644 (file)
@@ -9,22 +9,30 @@
 
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
 #include <linux/ftrace.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
 #include <linux/list.h>
 
 #include <asm/cacheflush.h>
+#include <asm/code-patching.h>
 #include <asm/ftrace.h>
 
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt , ...)      do { } while (0)
+#endif
 
-static unsigned int ftrace_nop = 0x60000000;
+static unsigned int ftrace_nop = PPC_NOP_INSTR;
 
 #ifdef CONFIG_PPC32
 # define GET_ADDR(addr) addr
 #else
 /* PowerPC64's functions are data that points to the functions */
-# define GET_ADDR(addr) *(unsigned long *)addr
+# define GET_ADDR(addr) (*(unsigned long *)addr)
 #endif
 
 
@@ -33,12 +41,12 @@ static unsigned int ftrace_calc_offset(long ip, long addr)
        return (int)(addr - ip);
 }
 
-unsigned char *ftrace_nop_replace(void)
+static unsigned char *ftrace_nop_replace(void)
 {
        return (char *)&ftrace_nop;
 }
 
-unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 {
        static unsigned int op;
 
@@ -68,49 +76,422 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 # define _ASM_PTR      " .long "
 #endif
 
-int
+static int
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
                   unsigned char *new_code)
 {
-       unsigned replaced;
-       unsigned old = *(unsigned *)old_code;
-       unsigned new = *(unsigned *)new_code;
-       int faulted = 0;
+       unsigned char replaced[MCOUNT_INSN_SIZE];
 
        /*
         * Note: Due to modules and __init, code can
         *  disappear and change, we need to protect against faulting
-        *  as well as code changing.
+        *  as well as code changing. We do this by using the
+        *  probe_kernel_* functions.
         *
         * No real locking needed, this code is run through
-        * kstop_machine.
+        * kstop_machine, or before SMP starts.
         */
-       asm volatile (
-               "1: lwz         %1, 0(%2)\n"
-               "   cmpw        %1, %5\n"
-               "   bne         2f\n"
-               "   stwu        %3, 0(%2)\n"
-               "2:\n"
-               ".section .fixup, \"ax\"\n"
-               "3:     li %0, 1\n"
-               "       b 2b\n"
-               ".previous\n"
-               ".section __ex_table,\"a\"\n"
-               _ASM_ALIGN "\n"
-               _ASM_PTR "1b, 3b\n"
-               ".previous"
-               : "=r"(faulted), "=r"(replaced)
-               : "r"(ip), "r"(new),
-                 "0"(faulted), "r"(old)
-               : "memory");
-
-       if (replaced != old && replaced != new)
-               faulted = 2;
-
-       if (!faulted)
-               flush_icache_range(ip, ip + 8);
-
-       return faulted;
+
+       /* read the text we want to modify */
+       if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+               return -EFAULT;
+
+       /* Make sure it is what we expect it to be */
+       if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+               return -EINVAL;
+
+       /* replace the text with the new text */
+       if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+               return -EPERM;
+
+       flush_icache_range(ip, ip + 8);
+
+       return 0;
+}
+
+/*
+ * Helper functions that are the same for both PPC64 and PPC32.
+ */
+static int test_24bit_addr(unsigned long ip, unsigned long addr)
+{
+
+       /* use the create_branch to verify that this offset can be branched */
+       return create_branch((unsigned int *)ip, addr, 0);
+}
+
+static int is_bl_op(unsigned int op)
+{
+       return (op & 0xfc000003) == 0x48000001;
+}
+
+static unsigned long find_bl_target(unsigned long ip, unsigned int op)
+{
+       static int offset;
+
+       offset = (op & 0x03fffffc);
+       /* make it signed */
+       if (offset & 0x02000000)
+               offset |= 0xfe000000;
+
+       return ip + (long)offset;
+}
+
+#ifdef CONFIG_PPC64
+static int
+__ftrace_make_nop(struct module *mod,
+                 struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned int op;
+       unsigned int jmp[5];
+       unsigned long ptr;
+       unsigned long ip = rec->ip;
+       unsigned long tramp;
+       int offset;
+
+       /* read where this goes */
+       if (probe_kernel_read(&op, (void *)ip, sizeof(int)))
+               return -EFAULT;
+
+       /* Make sure that this is still a 24bit jump */
+       if (!is_bl_op(op)) {
+               printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
+               return -EINVAL;
+       }
+
+       /* lets find where the pointer goes */
+       tramp = find_bl_target(ip, op);
+
+       /*
+        * On PPC64 the trampoline looks like:
+        * 0x3d, 0x82, 0x00, 0x00,    addis   r12,r2, <high>
+        * 0x39, 0x8c, 0x00, 0x00,    addi    r12,r12, <low>
+        *   Where the bytes 2,3,6 and 7 make up the 32bit offset
+        *   into the TOC that holds the pointer to the function
+        *   to jump to.
+        * 0xf8, 0x41, 0x00, 0x28,    std     r2,40(r1)
+        * 0xe9, 0x6c, 0x00, 0x20,    ld      r11,32(r12)
+        *   The actual address is 32 bytes from the offset
+        *   into the TOC.
+        * 0xe8, 0x4c, 0x00, 0x28,    ld      r2,40(r12)
+        */
+
+       DEBUGP("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc);
+
+       /* Find where the trampoline jumps to */
+       if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
+               printk(KERN_ERR "Failed to read %lx\n", tramp);
+               return -EFAULT;
+       }
+
+       DEBUGP(" %08x %08x", jmp[0], jmp[1]);
+
+       /* verify that this is what we expect it to be */
+       if (((jmp[0] & 0xffff0000) != 0x3d820000) ||
+           ((jmp[1] & 0xffff0000) != 0x398c0000) ||
+           (jmp[2] != 0xf8410028) ||
+           (jmp[3] != 0xe96c0020) ||
+           (jmp[4] != 0xe84c0028)) {
+               printk(KERN_ERR "Not a trampoline\n");
+               return -EINVAL;
+       }
+
+       offset = (unsigned)((unsigned short)jmp[0]) << 16 |
+               (unsigned)((unsigned short)jmp[1]);
+
+       DEBUGP(" %x ", offset);
+
+       /* get the address this jumps to */
+       tramp = mod->arch.toc + offset + 32;
+       DEBUGP("toc: %lx", tramp);
+
+       if (probe_kernel_read(jmp, (void *)tramp, 8)) {
+               printk(KERN_ERR "Failed to read %lx\n", tramp);
+               return -EFAULT;
+       }
+
+       DEBUGP(" %08x %08x\n", jmp[0], jmp[1]);
+
+       ptr = ((unsigned long)jmp[0] << 32) + jmp[1];
+
+       /* This should match what was called */
+       if (ptr != GET_ADDR(addr)) {
+               printk(KERN_ERR "addr does not match %lx\n", ptr);
+               return -EINVAL;
+       }
+
+       /*
+        * We want to nop the line, but the next line is
+        *  0xe8, 0x41, 0x00, 0x28   ld r2,40(r1)
+        * This needs to be turned to a nop too.
+        */
+       if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE))
+               return -EFAULT;
+
+       if (op != 0xe8410028) {
+               printk(KERN_ERR "Next line is not ld! (%08x)\n", op);
+               return -EINVAL;
+       }
+
+       /*
+        * Milton Miller pointed out that we can not blindly do nops.
+        * If a task was preempted when calling a trace function,
+        * the nops will remove the way to restore the TOC in r2
+        * and the r2 TOC will get corrupted.
+        */
+
+       /*
+        * Replace:
+        *   bl <tramp>  <==== will be replaced with "b 1f"
+        *   ld r2,40(r1)
+        *  1:
+        */
+       op = 0x48000008;        /* b +8 */
+
+       if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+               return -EPERM;
+
+
+       flush_icache_range(ip, ip + 8);
+
+       return 0;
+}
+
+#else /* !PPC64 */
+static int
+__ftrace_make_nop(struct module *mod,
+                 struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned int op;
+       unsigned int jmp[4];
+       unsigned long ip = rec->ip;
+       unsigned long tramp;
+
+       if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
+               return -EFAULT;
+
+       /* Make sure that this is still a 24bit jump */
+       if (!is_bl_op(op)) {
+               printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
+               return -EINVAL;
+       }
+
+       /* lets find where the pointer goes */
+       tramp = find_bl_target(ip, op);
+
+       /*
+        * On PPC32 the trampoline looks like:
+        *  0x3d, 0x60, 0x00, 0x00  lis r11,sym@ha
+        *  0x39, 0x6b, 0x00, 0x00  addi r11,r11,sym@l
+        *  0x7d, 0x69, 0x03, 0xa6  mtctr r11
+        *  0x4e, 0x80, 0x04, 0x20  bctr
+        */
+
+       DEBUGP("ip:%lx jumps to %lx", ip, tramp);
+
+       /* Find where the trampoline jumps to */
+       if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
+               printk(KERN_ERR "Failed to read %lx\n", tramp);
+               return -EFAULT;
+       }
+
+       DEBUGP(" %08x %08x ", jmp[0], jmp[1]);
+
+       /* verify that this is what we expect it to be */
+       if (((jmp[0] & 0xffff0000) != 0x3d600000) ||
+           ((jmp[1] & 0xffff0000) != 0x396b0000) ||
+           (jmp[2] != 0x7d6903a6) ||
+           (jmp[3] != 0x4e800420)) {
+               printk(KERN_ERR "Not a trampoline\n");
+               return -EINVAL;
+       }
+
+       tramp = (jmp[1] & 0xffff) |
+               ((jmp[0] & 0xffff) << 16);
+       if (tramp & 0x8000)
+               tramp -= 0x10000;
+
+       DEBUGP(" %x ", tramp);
+
+       if (tramp != addr) {
+               printk(KERN_ERR
+                      "Trampoline location %08lx does not match addr\n",
+                      tramp);
+               return -EINVAL;
+       }
+
+       op = PPC_NOP_INSTR;
+
+       if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+               return -EPERM;
+
+       flush_icache_range(ip, ip + 8);
+
+       return 0;
+}
+#endif /* PPC64 */
+
+int ftrace_make_nop(struct module *mod,
+                   struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned char *old, *new;
+       unsigned long ip = rec->ip;
+
+       /*
+        * If the calling address is more than 24 bits away,
+        * then we had to use a trampoline to make the call.
+        * Otherwise just update the call site.
+        */
+       if (test_24bit_addr(ip, addr)) {
+               /* within range */
+               old = ftrace_call_replace(ip, addr);
+               new = ftrace_nop_replace();
+               return ftrace_modify_code(ip, old, new);
+       }
+
+       /*
+        * Out of range jumps are called from modules.
+        * We should either already have a pointer to the module
+        * or it has been passed in.
+        */
+       if (!rec->arch.mod) {
+               if (!mod) {
+                       printk(KERN_ERR "No module loaded addr=%lx\n",
+                              addr);
+                       return -EFAULT;
+               }
+               rec->arch.mod = mod;
+       } else if (mod) {
+               if (mod != rec->arch.mod) {
+                       printk(KERN_ERR
+                              "Record mod %p not equal to passed in mod %p\n",
+                              rec->arch.mod, mod);
+                       return -EINVAL;
+               }
+               /* nothing to do if mod == rec->arch.mod */
+       } else
+               mod = rec->arch.mod;
+
+       return __ftrace_make_nop(mod, rec, addr);
+
+}
+
+#ifdef CONFIG_PPC64
+static int
+__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned int op[2];
+       unsigned long ip = rec->ip;
+
+       /* read where this goes */
+       if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2))
+               return -EFAULT;
+
+       /*
+        * It should be pointing to two nops or
+        *  b +8; ld r2,40(r1)
+        */
+       if (((op[0] != 0x48000008) || (op[1] != 0xe8410028)) &&
+           ((op[0] != PPC_NOP_INSTR) || (op[1] != PPC_NOP_INSTR))) {
+               printk(KERN_ERR "Expected NOPs but have %x %x\n", op[0], op[1]);
+               return -EINVAL;
+       }
+
+       /* If we never set up a trampoline to ftrace_caller, then bail */
+       if (!rec->arch.mod->arch.tramp) {
+               printk(KERN_ERR "No ftrace trampoline\n");
+               return -EINVAL;
+       }
+
+       /* create the branch to the trampoline */
+       op[0] = create_branch((unsigned int *)ip,
+                             rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
+       if (!op[0]) {
+               printk(KERN_ERR "REL24 out of range!\n");
+               return -EINVAL;
+       }
+
+       /* ld r2,40(r1) */
+       op[1] = 0xe8410028;
+
+       DEBUGP("write to %lx\n", rec->ip);
+
+       if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2))
+               return -EPERM;
+
+       flush_icache_range(ip, ip + 8);
+
+       return 0;
+}
+#else
+static int
+__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned int op;
+       unsigned long ip = rec->ip;
+
+       /* read where this goes */
+       if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
+               return -EFAULT;
+
+       /* It should be pointing to a nop */
+       if (op != PPC_NOP_INSTR) {
+               printk(KERN_ERR "Expected NOP but have %x\n", op);
+               return -EINVAL;
+       }
+
+       /* If we never set up a trampoline to ftrace_caller, then bail */
+       if (!rec->arch.mod->arch.tramp) {
+               printk(KERN_ERR "No ftrace trampoline\n");
+               return -EINVAL;
+       }
+
+       /* create the branch to the trampoline */
+       op = create_branch((unsigned int *)ip,
+                          rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
+       if (!op) {
+               printk(KERN_ERR "REL24 out of range!\n");
+               return -EINVAL;
+       }
+
+       DEBUGP("write to %lx\n", rec->ip);
+
+       if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+               return -EPERM;
+
+       flush_icache_range(ip, ip + 8);
+
+       return 0;
+}
+#endif /* CONFIG_PPC64 */
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned char *old, *new;
+       unsigned long ip = rec->ip;
+
+       /*
+        * If the calling address is more than 24 bits away,
+        * then we had to use a trampoline to make the call.
+        * Otherwise just update the call site.
+        */
+       if (test_24bit_addr(ip, addr)) {
+               /* within range */
+               old = ftrace_nop_replace();
+               new = ftrace_call_replace(ip, addr);
+               return ftrace_modify_code(ip, old, new);
+       }
+
+       /*
+        * Out of range jumps are called from modules.
+        * Being that we are converting from nop, it had better
+        * already have a module defined.
+        */
+       if (!rec->arch.mod) {
+               printk(KERN_ERR "No module loaded\n");
+               return -EINVAL;
+       }
+
+       return __ftrace_make_call(rec, addr);
 }
 
 int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -128,10 +509,10 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 
 int __init ftrace_dyn_arch_init(void *data)
 {
-       /* This is running in kstop_machine */
+       /* caller expects data to be zero */
+       unsigned long *p = data;
 
-       ftrace_mcount_set(data);
+       *p = 0;
 
        return 0;
 }
-
index 31982d0..88d9c1d 100644 (file)
@@ -69,10 +69,15 @@ void cpu_idle(void)
                                smp_mb();
                                local_irq_disable();
 
+                               /* Don't trace irqs off for idle */
+                               stop_critical_timings();
+
                                /* check again after disabling irqs */
                                if (!need_resched() && !cpu_should_die())
                                        ppc_md.power_save();
 
+                               start_critical_timings();
+
                                local_irq_enable();
                                set_thread_flag(TIF_POLLING_NRFLAG);
 
index 2df91a0..f832773 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/ftrace.h>
 #include <linux/cache.h>
 #include <linux/bug.h>
 #include <linux/sort.h>
@@ -53,6 +54,9 @@ static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)
                        r_addend = rela[i].r_addend;
                }
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+       _count_relocs++;        /* add one for ftrace_caller */
+#endif
        return _count_relocs;
 }
 
@@ -306,5 +310,11 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,
                        return -ENOEXEC;
                }
        }
+#ifdef CONFIG_DYNAMIC_FTRACE
+       module->arch.tramp =
+               do_plt_call(module->module_core,
+                           (unsigned long)ftrace_caller,
+                           sechdrs, module);
+#endif
        return 0;
 }
index 1af2377..8992b03 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/moduleloader.h>
 #include <linux/err.h>
 #include <linux/vmalloc.h>
+#include <linux/ftrace.h>
 #include <linux/bug.h>
 #include <asm/module.h>
 #include <asm/firmware.h>
@@ -163,6 +164,11 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
                }
        }
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+       /* make the trampoline to the ftrace_caller */
+       relocs++;
+#endif
+
        DEBUGP("Looks like a total of %lu stubs, max\n", relocs);
        return relocs * sizeof(struct ppc64_stub_entry);
 }
@@ -441,5 +447,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
                }
        }
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+       me->arch.toc = my_r2(sechdrs, me);
+       me->arch.tramp = stub_for_addr(sechdrs,
+                                      (unsigned long)ftrace_caller,
+                                      me);
+#endif
+
        return 0;
 }
index d69912c..8db3527 100644 (file)
@@ -6,6 +6,9 @@ ifeq ($(CONFIG_PPC64),y)
 EXTRA_CFLAGS           += -mno-minimal-toc
 endif
 
+CFLAGS_REMOVE_code-patching.o = -pg
+CFLAGS_REMOVE_feature-fixups.o = -pg
+
 obj-y                  := string.o alloc.o \
                           checksum_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC32)    += div64.o copy_32.o crtsavres.o
index 95f0085..279d9cc 100644 (file)
@@ -5,7 +5,6 @@
 
 /* sched_domains SD_NODE_INIT for sh machines */
 #define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
index 753346e..ae5f94d 100644 (file)
@@ -11,21 +11,21 @@ extern int get_signals(void);
 extern void block_signals(void);
 extern void unblock_signals(void);
 
-#define local_save_flags(flags) do { typecheck(unsigned long, flags); \
+#define raw_local_save_flags(flags) do { typecheck(unsigned long, flags); \
                                     (flags) = get_signals(); } while(0)
-#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \
+#define raw_local_irq_restore(flags) do { typecheck(unsigned long, flags); \
                                      set_signals(flags); } while(0)
 
-#define local_irq_save(flags) do { local_save_flags(flags); \
-                                   local_irq_disable(); } while(0)
+#define raw_local_irq_save(flags) do { raw_local_save_flags(flags); \
+                                   raw_local_irq_disable(); } while(0)
 
-#define local_irq_enable() unblock_signals()
-#define local_irq_disable() block_signals()
+#define raw_local_irq_enable() unblock_signals()
+#define raw_local_irq_disable() block_signals()
 
 #define irqs_disabled()                 \
 ({                                      \
         unsigned long flags;            \
-        local_save_flags(flags);        \
+        raw_local_save_flags(flags);        \
         (flags == 0);                   \
 })
 
index 7b7d276..c7235e6 100644 (file)
@@ -29,11 +29,14 @@ config X86
        select HAVE_FTRACE_MCOUNT_RECORD
        select HAVE_DYNAMIC_FTRACE
        select HAVE_FUNCTION_TRACER
+       select HAVE_FUNCTION_GRAPH_TRACER
+       select HAVE_FUNCTION_TRACE_MCOUNT_TEST
        select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
        select HAVE_ARCH_KGDB if !X86_VOYAGER
        select HAVE_ARCH_TRACEHOOK
        select HAVE_GENERIC_DMA_COHERENT if X86_32
        select HAVE_EFFICIENT_UNALIGNED_ACCESS
+       select USER_STACKTRACE_SUPPORT
 
 config ARCH_DEFCONFIG
        string
index b815664..85a7857 100644 (file)
@@ -515,6 +515,7 @@ config CPU_SUP_UMC_32
 config X86_DS
        def_bool X86_PTRACE_BTS
        depends on X86_DEBUGCTLMSR
+       select HAVE_HW_BRANCH_TRACER
 
 config X86_PTRACE_BTS
        bool "Branch Trace Store"
index 2a3dfbd..fa013f5 100644 (file)
@@ -186,14 +186,10 @@ config IOMMU_LEAK
          Add a simple leak tracer to the IOMMU code. This is useful when you
          are debugging a buggy device driver that leaks IOMMU mappings.
 
-config MMIOTRACE_HOOKS
-       bool
-
 config MMIOTRACE
        bool "Memory mapped IO tracing"
        depends on DEBUG_KERNEL && PCI
        select TRACING
-       select MMIOTRACE_HOOKS
        help
          Mmiotrace traces Memory Mapped I/O access and is meant for
          debugging and reverse engineering. It is called from the ioremap
index a950084..99b6c39 100644 (file)
@@ -7,13 +7,12 @@
  *
  * It manages:
  * - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
+ * - buffer overflow handling (to be done)
  * - buffer access
  *
  * It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
+ * - get_task_struct on all traced tasks
+ * - current is allowed to trace tasks
  *
  *
  * Copyright (C) 2007-2008 Intel Corporation.
 
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/err.h>
 
 
 #ifdef CONFIG_X86_DS
 
 struct task_struct;
+struct ds_tracer;
+struct bts_tracer;
+struct pebs_tracer;
+
+typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
+typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
 
 /*
  * Request BTS or PEBS
@@ -38,60 +44,62 @@ struct task_struct;
 * Due to alignment constraints, the actual buffer may be slightly
  * smaller than the requested or provided buffer.
  *
- * Returns 0 on success; -Eerrno otherwise
+ * Returns a pointer to a tracer structure on success, or
+ * ERR_PTR(errcode) on failure.
+ *
+ * The interrupt threshold is independent from the overflow callback
+ * to allow users to use their own overflow interrupt handling mechanism.
  *
  * task: the task to request recording for;
  *       NULL for per-cpu recording on the current cpu
  * base: the base pointer for the (non-pageable) buffer;
- *       NULL if buffer allocation requested
- * size: the size of the requested or provided buffer
+ * size: the size of the provided buffer in bytes
  * ovfl: pointer to a function to be called on buffer overflow;
  *       NULL if cyclic buffer requested
+ * th: the interrupt threshold in records from the end of the buffer;
+ *     -1 if no interrupt threshold is requested.
  */
-typedef void (*ds_ovfl_callback_t)(struct task_struct *);
-extern int ds_request_bts(struct task_struct *task, void *base, size_t size,
-                         ds_ovfl_callback_t ovfl);
-extern int ds_request_pebs(struct task_struct *task, void *base, size_t size,
-                          ds_ovfl_callback_t ovfl);
+extern struct bts_tracer *ds_request_bts(struct task_struct *task,
+                                        void *base, size_t size,
+                                        bts_ovfl_callback_t ovfl, size_t th);
+extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+                                          void *base, size_t size,
+                                          pebs_ovfl_callback_t ovfl,
+                                          size_t th);
 
 /*
  * Release BTS or PEBS resources
  *
- * Frees buffers allocated on ds_request.
- *
  * Returns 0 on success; -Eerrno otherwise
  *
- * task: the task to release resources for;
- *       NULL to release resources for the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  */
-extern int ds_release_bts(struct task_struct *task);
-extern int ds_release_pebs(struct task_struct *task);
+extern int ds_release_bts(struct bts_tracer *tracer);
+extern int ds_release_pebs(struct pebs_tracer *tracer);
 
 /*
- * Return the (array) index of the write pointer.
+ * Get the (array) index of the write pointer.
  * (assuming an array of BTS/PEBS records)
  *
- * Returns -Eerrno on error
+ * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
- * pos (out): if not NULL, will hold the result
+ * tracer: the tracer handle returned from ds_request_~()
+ * pos (out): will hold the result
  */
-extern int ds_get_bts_index(struct task_struct *task, size_t *pos);
-extern int ds_get_pebs_index(struct task_struct *task, size_t *pos);
+extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos);
+extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos);
 
 /*
- * Return the (array) index one record beyond the end of the array.
+ * Get the (array) index one record beyond the end of the array.
  * (assuming an array of BTS/PEBS records)
  *
- * Returns -Eerrno on error
+ * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
- * pos (out): if not NULL, will hold the result
+ * tracer: the tracer handle returned from ds_request_~()
+ * pos (out): will hold the result
  */
-extern int ds_get_bts_end(struct task_struct *task, size_t *pos);
-extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);
+extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos);
+extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos);
 
 /*
  * Provide a pointer to the BTS/PEBS record at parameter index.
@@ -102,14 +110,13 @@ extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);
  *
  * Returns the size of a single record on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  * index: the index of the requested record
  * record (out): pointer to the requested record
  */
-extern int ds_access_bts(struct task_struct *task,
+extern int ds_access_bts(struct bts_tracer *tracer,
                         size_t index, const void **record);
-extern int ds_access_pebs(struct task_struct *task,
+extern int ds_access_pebs(struct pebs_tracer *tracer,
                          size_t index, const void **record);
 
 /*
@@ -129,38 +136,24 @@ extern int ds_access_pebs(struct task_struct *task,
  *
  * Returns the number of bytes written or -Eerrno.
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  * buffer: the buffer to write
  * size: the size of the buffer
  */
-extern int ds_write_bts(struct task_struct *task,
+extern int ds_write_bts(struct bts_tracer *tracer,
                        const void *buffer, size_t size);
-extern int ds_write_pebs(struct task_struct *task,
+extern int ds_write_pebs(struct pebs_tracer *tracer,
                         const void *buffer, size_t size);
 
 /*
- * Same as ds_write_bts/pebs, but omit ownership checks.
- *
- * This is needed to have some other task than the owner of the
- * BTS/PEBS buffer or the parameter task itself write into the
- * respective buffer.
- */
-extern int ds_unchecked_write_bts(struct task_struct *task,
-                                 const void *buffer, size_t size);
-extern int ds_unchecked_write_pebs(struct task_struct *task,
-                                  const void *buffer, size_t size);
-
-/*
  * Reset the write pointer of the BTS/PEBS buffer.
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  */
-extern int ds_reset_bts(struct task_struct *task);
-extern int ds_reset_pebs(struct task_struct *task);
+extern int ds_reset_bts(struct bts_tracer *tracer);
+extern int ds_reset_pebs(struct pebs_tracer *tracer);
 
 /*
  * Clear the BTS/PEBS buffer and reset the write pointer.
@@ -168,33 +161,30 @@ extern int ds_reset_pebs(struct task_struct *task);
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  */
-extern int ds_clear_bts(struct task_struct *task);
-extern int ds_clear_pebs(struct task_struct *task);
+extern int ds_clear_bts(struct bts_tracer *tracer);
+extern int ds_clear_pebs(struct pebs_tracer *tracer);
 
 /*
  * Provide the PEBS counter reset value.
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_pebs()
  * value (out): the counter reset value
  */
-extern int ds_get_pebs_reset(struct task_struct *task, u64 *value);
+extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value);
 
 /*
  * Set the PEBS counter reset value.
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_pebs()
  * value: the new counter reset value
  */
-extern int ds_set_pebs_reset(struct task_struct *task, u64 value);
+extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
 
 /*
  * Initialization
@@ -207,17 +197,13 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
 /*
  * The DS context - part of struct thread_struct.
  */
+#define MAX_SIZEOF_DS (12 * 8)
+
 struct ds_context {
        /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
-       unsigned char *ds;
+       unsigned char ds[MAX_SIZEOF_DS];
        /* the owner of the BTS and PEBS configuration, respectively */
-       struct task_struct *owner[2];
-       /* buffer overflow notification function for BTS and PEBS */
-       ds_ovfl_callback_t callback[2];
-       /* the original buffer address */
-       void *buffer[2];
-       /* the number of allocated pages for on-request allocated buffers */
-       unsigned int pages[2];
+       struct ds_tracer  *owner[2];
        /* use count */
        unsigned long count;
        /* a pointer to the context location inside the thread_struct
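
Illustrative aside, not part of the patch: with the reworked interface above, a caller keeps the returned tracer handle instead of a task pointer and checks it with IS_ERR(); releasing the tracer no longer frees the caller-provided buffer. The buffer size, helper names and static handle below are hypothetical.

#include <linux/err.h>
#include <linux/slab.h>
#include <asm/ds.h>

#define EXAMPLE_BTS_BUFFER_SIZE 4096            /* hypothetical buffer size */

static struct bts_tracer *example_tracer;       /* hypothetical handle */
static void *example_buffer;                    /* hypothetical buffer */

static int example_start_bts(struct task_struct *task)
{
        example_buffer = kzalloc(EXAMPLE_BTS_BUFFER_SIZE, GFP_KERNEL);
        if (!example_buffer)
                return -ENOMEM;

        /* no overflow callback, no interrupt threshold (th == -1) */
        example_tracer = ds_request_bts(task, example_buffer,
                                        EXAMPLE_BTS_BUFFER_SIZE,
                                        NULL, (size_t)-1);
        if (IS_ERR(example_tracer)) {
                int error = PTR_ERR(example_tracer);

                kfree(example_buffer);
                example_tracer = NULL;
                return error;
        }
        return 0;
}

static void example_stop_bts(void)
{
        ds_release_bts(example_tracer);   /* gives up ownership of the buffer */
        kfree(example_buffer);            /* the caller still owns the memory */
}
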
index 9e8bc29..7e61b4c 100644 (file)
@@ -17,8 +17,40 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
         */
        return addr - 1;
 }
-#endif
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+struct dyn_arch_ftrace {
+       /* No extra data needed for x86 */
+};
+
+#endif /*  CONFIG_DYNAMIC_FTRACE */
+#endif /* __ASSEMBLY__ */
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Stack of return addresses for functions
+ * of a thread.
+ * Used in struct thread_info
+ */
+struct ftrace_ret_stack {
+       unsigned long ret;
+       unsigned long func;
+       unsigned long long calltime;
+};
+
+/*
+ * Primary handler of a function return.
+ * It relies on ftrace_return_to_handler.
+ * Defined in entry_32.S
+ */
+extern void return_to_handler(void);
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 #endif /* _ASM_X86_FTRACE_H */
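
Conceptual sketch, not taken from this patch: the function graph tracer keeps a small per-thread array of struct ftrace_ret_stack entries so it can splice return_to_handler() into the return path and later restore the real return address. The depth constant and helper names below are made up for illustration.

struct example_ret_stack {
        unsigned long ret;              /* the real return address we displaced */
        unsigned long func;             /* the traced function's entry address */
        unsigned long long calltime;    /* timestamp taken at function entry */
};

#define EXAMPLE_RET_STACK_DEPTH 50      /* hypothetical per-thread depth */

static struct example_ret_stack example_stack[EXAMPLE_RET_STACK_DEPTH];
static int example_index = -1;

/* called at function entry, before the return address is rewritten */
static int example_push_return(unsigned long ret, unsigned long func,
                               unsigned long long now)
{
        if (example_index + 1 >= EXAMPLE_RET_STACK_DEPTH)
                return -1;              /* too deep, leave this call untraced */

        example_index++;
        example_stack[example_index].ret = ret;
        example_stack[example_index].func = func;
        example_stack[example_index].calltime = now;
        return 0;
}

/* called from the return handler to recover the original return address */
static unsigned long example_pop_return(unsigned long long now,
                                        unsigned long long *duration)
{
        unsigned long ret = example_stack[example_index].ret;

        *duration = now - example_stack[example_index].calltime;
        example_index--;
        return ret;
}
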
index e44d379..0921b40 100644 (file)
@@ -20,6 +20,8 @@
 struct task_struct;
 struct exec_domain;
 #include <asm/processor.h>
+#include <asm/ftrace.h>
+#include <asm/atomic.h>
 
 struct thread_info {
        struct task_struct      *task;          /* main task structure */
index 35c5492..99192bb 100644 (file)
@@ -157,6 +157,7 @@ extern int __get_user_bad(void);
        int __ret_gu;                                                   \
        unsigned long __val_gu;                                         \
        __chk_user_ptr(ptr);                                            \
+       might_fault();                                                  \
        switch (sizeof(*(ptr))) {                                       \
        case 1:                                                         \
                __get_user_x(1, __ret_gu, __val_gu, ptr);               \
@@ -241,6 +242,7 @@ extern void __put_user_8(void);
        int __ret_pu;                                           \
        __typeof__(*(ptr)) __pu_val;                            \
        __chk_user_ptr(ptr);                                    \
+       might_fault();                                          \
        __pu_val = x;                                           \
        switch (sizeof(*(ptr))) {                               \
        case 1:                                                 \
index d095a3a..5e06259 100644 (file)
@@ -82,8 +82,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
 static __always_inline unsigned long __must_check
 __copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-       might_sleep();
-       return __copy_to_user_inatomic(to, from, n);
+       might_fault();
+       return __copy_to_user_inatomic(to, from, n);
 }
 
 static __always_inline unsigned long
@@ -137,7 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
 static __always_inline unsigned long
 __copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-       might_sleep();
+       might_fault();
        if (__builtin_constant_p(n)) {
                unsigned long ret;
 
@@ -159,7 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
 static __always_inline unsigned long __copy_from_user_nocache(void *to,
                                const void __user *from, unsigned long n)
 {
-       might_sleep();
+       might_fault();
        if (__builtin_constant_p(n)) {
                unsigned long ret;
 
index f8cfd00..84210c4 100644 (file)
@@ -29,6 +29,8 @@ static __always_inline __must_check
 int __copy_from_user(void *dst, const void __user *src, unsigned size)
 {
        int ret = 0;
+
+       might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic(dst, (__force void *)src, size);
        switch (size) {
@@ -71,6 +73,8 @@ static __always_inline __must_check
 int __copy_to_user(void __user *dst, const void *src, unsigned size)
 {
        int ret = 0;
+
+       might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic((__force void *)dst, src, size);
        switch (size) {
@@ -113,6 +117,8 @@ static __always_inline __must_check
 int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 {
        int ret = 0;
+
+       might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic((__force void *)dst,
                                         (__force void *)src, size);
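
Aside, not part of the patch: the uaccess hunks above all converge on the same pattern -- a helper that may touch user memory announces it with might_fault() before performing the access. A minimal sketch with a hypothetical wrapper name, mirroring __copy_to_user() as changed above:

#include <linux/kernel.h>
#include <linux/uaccess.h>

/* hypothetical wrapper following the might_fault() pattern in the patch */
static inline unsigned long
example_copy_to_user(void __user *to, const void *from, unsigned long n)
{
        might_fault();          /* may fault on user memory and sleep */
        return __copy_to_user_inatomic(to, from, n);
}
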
index b62a766..1cad931 100644 (file)
@@ -25,7 +25,7 @@ CFLAGS_tsc.o          := $(nostackp)
 
 obj-y                  := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y                  += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y                  += time_$(BITS).o ioport.o ldt.o
+obj-y                  += time_$(BITS).o ioport.o ldt.o dumpstack.o
 obj-y                  += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
 obj-$(CONFIG_X86_VISWS)        += visws_quirks.o
 obj-$(CONFIG_X86_32)   += probe_roms_32.o
@@ -65,6 +65,7 @@ obj-$(CONFIG_X86_LOCAL_APIC)  += apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)      += io_apic.o
 obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE)   += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER)    += ftrace.o
 obj-$(CONFIG_KEXEC)            += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)            += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)       += crash_dump_$(BITS).o
index 8e48c5d..88ea02d 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/cpufreq.h>
 #include <linux/compiler.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 #include <linux/acpi.h>
 #include <acpi/processor.h>
@@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
        unsigned int next_perf_state = 0; /* Index into perf table */
        unsigned int i;
        int result = 0;
+       struct power_trace it;
 
        dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
 
@@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
                }
        }
 
+       trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+
        switch (data->cpu_feature) {
        case SYSTEM_INTEL_MSR_CAPABLE:
                cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
index cce0b61..816f27f 100644 (file)
@@ -307,12 +307,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
                set_cpu_cap(c, X86_FEATURE_P4);
        if (c->x86 == 6)
                set_cpu_cap(c, X86_FEATURE_P3);
+#endif
 
        if (cpu_has_bts)
                ptrace_bts_init_intel(c);
 
-#endif
-
        detect_extended_topology(c);
        if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
                /*
index a2d1176..19a8c2c 100644 (file)
@@ -7,13 +7,12 @@
  *
  * It manages:
  * - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
+ * - buffer overflow handling (to be done)
  * - buffer access
  *
  * It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
+ * - get_task_struct on all traced tasks
+ * - current is allowed to trace tasks
  *
  *
  * Copyright (C) 2007-2008 Intel Corporation.
@@ -28,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/kernel.h>
 
 
 /*
@@ -44,6 +44,33 @@ struct ds_configuration {
 };
 static struct ds_configuration ds_cfg;
 
+/*
+ * A BTS or PEBS tracer.
+ *
+ * This holds the configuration of the tracer and serves as a handle
+ * to identify tracers.
+ */
+struct ds_tracer {
+       /* the DS context (partially) owned by this tracer */
+       struct ds_context *context;
+       /* the buffer provided on ds_request() and its size in bytes */
+       void *buffer;
+       size_t size;
+};
+
+struct bts_tracer {
+       /* the common DS part */
+       struct ds_tracer ds;
+       /* buffer overflow notification function */
+       bts_ovfl_callback_t ovfl;
+};
+
+struct pebs_tracer {
+       /* the common DS part */
+       struct ds_tracer ds;
+       /* buffer overflow notification function */
+       pebs_ovfl_callback_t ovfl;
+};
 
 /*
  * Debug Store (DS) save area configuration (see Intel64 and IA32
@@ -107,34 +134,13 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
        (*(unsigned long *)base) = value;
 }
 
+#define DS_ALIGNMENT (1 << 3)  /* BTS and PEBS buffer alignment */
 
-/*
- * Locking is done only for allocating BTS or PEBS resources and for
- * guarding context and buffer memory allocation.
- *
- * Most functions require the current task to own the ds context part
- * they are going to access. All the locking is done when validating
- * access to the context.
- */
-static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
 
 /*
- * Validate that the current task is allowed to access the BTS/PEBS
- * buffer of the parameter task.
- *
- * Returns 0, if access is granted; -Eerrno, otherwise.
+ * Locking is done only for allocating BTS or PEBS resources.
  */
-static inline int ds_validate_access(struct ds_context *context,
-                                    enum ds_qualifier qual)
-{
-       if (!context)
-               return -EPERM;
-
-       if (context->owner[qual] == current)
-               return 0;
-
-       return -EPERM;
-}
+static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
 
 
 /*
@@ -183,51 +189,13 @@ static inline int check_tracer(struct task_struct *task)
  *
  * Contexts are use-counted. They are allocated on first access and
  * deallocated when the last user puts the context.
- *
- * We distinguish between an allocating and a non-allocating get of a
- * context:
- * - the allocating get is used for requesting BTS/PEBS resources. It
- *   requires the caller to hold the global ds_lock.
- * - the non-allocating get is used for all other cases. A
- *   non-existing context indicates an error. It acquires and releases
- *   the ds_lock itself for obtaining the context.
- *
- * A context and its DS configuration are allocated and deallocated
- * together. A context always has a DS configuration of the
- * appropriate size.
  */
 static DEFINE_PER_CPU(struct ds_context *, system_context);
 
 #define this_system_context per_cpu(system_context, smp_processor_id())
 
-/*
- * Returns the pointer to the parameter task's context or to the
- * system-wide context, if task is NULL.
- *
- * Increases the use count of the returned context, if not NULL.
- */
 static inline struct ds_context *ds_get_context(struct task_struct *task)
 {
-       struct ds_context *context;
-       unsigned long irq;
-
-       spin_lock_irqsave(&ds_lock, irq);
-
-       context = (task ? task->thread.ds_ctx : this_system_context);
-       if (context)
-               context->count++;
-
-       spin_unlock_irqrestore(&ds_lock, irq);
-
-       return context;
-}
-
-/*
- * Same as ds_get_context, but allocates the context and it's DS
- * structure, if necessary; returns NULL; if out of memory.
- */
-static inline struct ds_context *ds_alloc_context(struct task_struct *task)
-{
        struct ds_context **p_context =
                (task ? &task->thread.ds_ctx : &this_system_context);
        struct ds_context *context = *p_context;
@@ -238,16 +206,9 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)
                if (!context)
                        return NULL;
 
-               context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
-               if (!context->ds) {
-                       kfree(context);
-                       return NULL;
-               }
-
                spin_lock_irqsave(&ds_lock, irq);
 
                if (*p_context) {
-                       kfree(context->ds);
                        kfree(context);
 
                        context = *p_context;
@@ -272,10 +233,6 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)
        return context;
 }
 
-/*
- * Decreases the use count of the parameter context, if not NULL.
- * Deallocates the context, if the use count reaches zero.
- */
 static inline void ds_put_context(struct ds_context *context)
 {
        unsigned long irq;
@@ -296,13 +253,6 @@ static inline void ds_put_context(struct ds_context *context)
        if (!context->task || (context->task == current))
                wrmsrl(MSR_IA32_DS_AREA, 0);
 
-       put_tracer(context->task);
-
-       /* free any leftover buffers from tracers that did not
-        * deallocate them properly. */
-       kfree(context->buffer[ds_bts]);
-       kfree(context->buffer[ds_pebs]);
-       kfree(context->ds);
        kfree(context);
  out:
        spin_unlock_irqrestore(&ds_lock, irq);
@@ -312,345 +262,342 @@ static inline void ds_put_context(struct ds_context *context)
 /*
  * Handle a buffer overflow
  *
- * task: the task whose buffers are overflowing;
- *       NULL for a buffer overflow on the current cpu
  * context: the ds context
  * qual: the buffer type
  */
-static void ds_overflow(struct task_struct *task, struct ds_context *context,
-                       enum ds_qualifier qual)
-{
-       if (!context)
-               return;
-
-       if (context->callback[qual])
-               (*context->callback[qual])(task);
-
-       /* todo: do some more overflow handling */
+static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
+{
+       switch (qual) {
+       case ds_bts: {
+               struct bts_tracer *tracer =
+                       container_of(context->owner[qual],
+                                    struct bts_tracer, ds);
+               if (tracer->ovfl)
+                       tracer->ovfl(tracer);
+       }
+               break;
+       case ds_pebs: {
+               struct pebs_tracer *tracer =
+                       container_of(context->owner[qual],
+                                    struct pebs_tracer, ds);
+               if (tracer->ovfl)
+                       tracer->ovfl(tracer);
+       }
+               break;
+       }
 }
 
 
-/*
- * Allocate a non-pageable buffer of the parameter size.
- * Checks the memory and the locked memory rlimit.
- *
- * Returns the buffer, if successful;
- *         NULL, if out of memory or rlimit exceeded.
- *
- * size: the requested buffer size in bytes
- * pages (out): if not NULL, contains the number of pages reserved
- */
-static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
+static void ds_install_ds_config(struct ds_context *context,
+                                enum ds_qualifier qual,
+                                void *base, size_t size, size_t ith)
 {
-       unsigned long rlim, vm, pgsz;
-       void *buffer;
-
-       pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
-       rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
-       vm   = current->mm->total_vm  + pgsz;
-       if (rlim < vm)
-               return NULL;
+       unsigned long buffer, adj;
 
-       rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
-       vm   = current->mm->locked_vm  + pgsz;
-       if (rlim < vm)
-               return NULL;
+       /* adjust the buffer address and size to meet alignment
+        * constraints:
+        * - buffer is double-word aligned
+        * - size is multiple of record size
+        *
+        * We checked the size at the very beginning; we have enough
+        * space to do the adjustment.
+        */
+       buffer = (unsigned long)base;
 
-       buffer = kzalloc(size, GFP_KERNEL);
-       if (!buffer)
-               return NULL;
+       adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
+       buffer += adj;
+       size   -= adj;
 
-       current->mm->total_vm  += pgsz;
-       current->mm->locked_vm += pgsz;
+       size /= ds_cfg.sizeof_rec[qual];
+       size *= ds_cfg.sizeof_rec[qual];
 
-       if (pages)
-               *pages = pgsz;
+       ds_set(context->ds, qual, ds_buffer_base, buffer);
+       ds_set(context->ds, qual, ds_index, buffer);
+       ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
 
-       return buffer;
+       /* The value for 'no threshold' is -1, which will set the
+        * threshold outside of the buffer, just like we want it.
+        */
+       ds_set(context->ds, qual,
+              ds_interrupt_threshold, buffer + size - ith);
 }
 
-static int ds_request(struct task_struct *task, void *base, size_t size,
-                     ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
+static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual,
+                     struct task_struct *task,
+                     void *base, size_t size, size_t th)
 {
        struct ds_context *context;
-       unsigned long buffer, adj;
-       const unsigned long alignment = (1 << 3);
        unsigned long irq;
-       int error = 0;
+       int error;
 
+       error = -EOPNOTSUPP;
        if (!ds_cfg.sizeof_ds)
-               return -EOPNOTSUPP;
+               goto out;
+
+       error = -EINVAL;
+       if (!base)
+               goto out;
 
        /* we require some space to do alignment adjustments below */
-       if (size < (alignment + ds_cfg.sizeof_rec[qual]))
-               return -EINVAL;
+       error = -EINVAL;
+       if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+               goto out;
 
-       /* buffer overflow notification is not yet implemented */
-       if (ovfl)
-               return -EOPNOTSUPP;
+       if (th != (size_t)-1) {
+               th *= ds_cfg.sizeof_rec[qual];
+
+               error = -EINVAL;
+               if (size <= th)
+                       goto out;
+       }
 
+       tracer->buffer = base;
+       tracer->size = size;
 
-       context = ds_alloc_context(task);
+       error = -ENOMEM;
+       context = ds_get_context(task);
        if (!context)
-               return -ENOMEM;
+               goto out;
+       tracer->context = context;
+
 
        spin_lock_irqsave(&ds_lock, irq);
 
        error = -EPERM;
        if (!check_tracer(task))
                goto out_unlock;
-
        get_tracer(task);
 
-       error = -EALREADY;
-       if (context->owner[qual] == current)
-               goto out_put_tracer;
        error = -EPERM;
-       if (context->owner[qual] != NULL)
+       if (context->owner[qual])
                goto out_put_tracer;
-       context->owner[qual] = current;
+       context->owner[qual] = tracer;
 
        spin_unlock_irqrestore(&ds_lock, irq);
 
 
-       error = -ENOMEM;
-       if (!base) {
-               base = ds_allocate_buffer(size, &context->pages[qual]);
-               if (!base)
-                       goto out_release;
-
-               context->buffer[qual]   = base;
-       }
-       error = 0;
+       ds_install_ds_config(context, qual, base, size, th);
 
-       context->callback[qual] = ovfl;
-
-       /* adjust the buffer address and size to meet alignment
-        * constraints:
-        * - buffer is double-word aligned
-        * - size is multiple of record size
-        *
-        * We checked the size at the very beginning; we have enough
-        * space to do the adjustment.
-        */
-       buffer = (unsigned long)base;
-
-       adj = ALIGN(buffer, alignment) - buffer;
-       buffer += adj;
-       size   -= adj;
-
-       size /= ds_cfg.sizeof_rec[qual];
-       size *= ds_cfg.sizeof_rec[qual];
-
-       ds_set(context->ds, qual, ds_buffer_base, buffer);
-       ds_set(context->ds, qual, ds_index, buffer);
-       ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
-
-       if (ovfl) {
-               /* todo: select a suitable interrupt threshold */
-       } else
-               ds_set(context->ds, qual,
-                      ds_interrupt_threshold, buffer + size + 1);
-
-       /* we keep the context until ds_release */
-       return error;
-
- out_release:
-       context->owner[qual] = NULL;
-       ds_put_context(context);
-       put_tracer(task);
-       return error;
+       return 0;
 
  out_put_tracer:
-       spin_unlock_irqrestore(&ds_lock, irq);
-       ds_put_context(context);
        put_tracer(task);
-       return error;
-
  out_unlock:
        spin_unlock_irqrestore(&ds_lock, irq);
        ds_put_context(context);
+       tracer->context = NULL;
+ out:
        return error;
 }
 
-int ds_request_bts(struct task_struct *task, void *base, size_t size,
-                  ds_ovfl_callback_t ovfl)
+struct bts_tracer *ds_request_bts(struct task_struct *task,
+                                 void *base, size_t size,
+                                 bts_ovfl_callback_t ovfl, size_t th)
 {
-       return ds_request(task, base, size, ovfl, ds_bts);
-}
+       struct bts_tracer *tracer;
+       int error;
 
-int ds_request_pebs(struct task_struct *task, void *base, size_t size,
-                   ds_ovfl_callback_t ovfl)
-{
-       return ds_request(task, base, size, ovfl, ds_pebs);
+       /* buffer overflow notification is not yet implemented */
+       error = -EOPNOTSUPP;
+       if (ovfl)
+               goto out;
+
+       error = -ENOMEM;
+       tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+       if (!tracer)
+               goto out;
+       tracer->ovfl = ovfl;
+
+       error = ds_request(&tracer->ds, ds_bts, task, base, size, th);
+       if (error < 0)
+               goto out_tracer;
+
+       return tracer;
+
+ out_tracer:
+       kfree(tracer);
+ out:
+       return ERR_PTR(error);
 }
 
-static int ds_release(struct task_struct *task, enum ds_qualifier qual)
+struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+                                   void *base, size_t size,
+                                   pebs_ovfl_callback_t ovfl, size_t th)
 {
-       struct ds_context *context;
+       struct pebs_tracer *tracer;
        int error;
 
-       context = ds_get_context(task);
-       error = ds_validate_access(context, qual);
-       if (error < 0)
+       /* buffer overflow notification is not yet implemented */
+       error = -EOPNOTSUPP;
+       if (ovfl)
                goto out;
 
-       kfree(context->buffer[qual]);
-       context->buffer[qual] = NULL;
+       error = -ENOMEM;
+       tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+       if (!tracer)
+               goto out;
+       tracer->ovfl = ovfl;
 
-       current->mm->total_vm  -= context->pages[qual];
-       current->mm->locked_vm -= context->pages[qual];
-       context->pages[qual] = 0;
-       context->owner[qual] = NULL;
+       error = ds_request(&tracer->ds, ds_pebs, task, base, size, th);
+       if (error < 0)
+               goto out_tracer;
 
-       /*
-        * we put the context twice:
-        *   once for the ds_get_context
-        *   once for the corresponding ds_request
-        */
-       ds_put_context(context);
+       return tracer;
+
+ out_tracer:
+       kfree(tracer);
  out:
-       ds_put_context(context);
-       return error;
+       return ERR_PTR(error);
 }
 
-int ds_release_bts(struct task_struct *task)
+static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual)
 {
-       return ds_release(task, ds_bts);
+       BUG_ON(tracer->context->owner[qual] != tracer);
+       tracer->context->owner[qual] = NULL;
+
+       put_tracer(tracer->context->task);
+       ds_put_context(tracer->context);
 }
 
-int ds_release_pebs(struct task_struct *task)
+int ds_release_bts(struct bts_tracer *tracer)
 {
-       return ds_release(task, ds_pebs);
+       if (!tracer)
+               return -EINVAL;
+
+       ds_release(&tracer->ds, ds_bts);
+       kfree(tracer);
+
+       return 0;
 }
 
-static int ds_get_index(struct task_struct *task, size_t *pos,
-                       enum ds_qualifier qual)
+int ds_release_pebs(struct pebs_tracer *tracer)
 {
-       struct ds_context *context;
-       unsigned long base, index;
-       int error;
+       if (!tracer)
+               return -EINVAL;
 
-       context = ds_get_context(task);
-       error = ds_validate_access(context, qual);
-       if (error < 0)
-               goto out;
+       ds_release(&tracer->ds, ds_pebs);
+       kfree(tracer);
+
+       return 0;
+}
+
+static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual)
+{
+       unsigned long base, index;
 
        base  = ds_get(context->ds, qual, ds_buffer_base);
        index = ds_get(context->ds, qual, ds_index);
 
-       error = ((index - base) / ds_cfg.sizeof_rec[qual]);
-       if (pos)
-               *pos = error;
- out:
-       ds_put_context(context);
-       return error;
+       return (index - base) / ds_cfg.sizeof_rec[qual];
 }
 
-int ds_get_bts_index(struct task_struct *task, size_t *pos)
+int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos)
 {
-       return ds_get_index(task, pos, ds_bts);
+       if (!tracer)
+               return -EINVAL;
+
+       if (!pos)
+               return -EINVAL;
+
+       *pos = ds_get_index(tracer->ds.context, ds_bts);
+
+       return 0;
 }
 
-int ds_get_pebs_index(struct task_struct *task, size_t *pos)
+int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos)
 {
-       return ds_get_index(task, pos, ds_pebs);
+       if (!tracer)
+               return -EINVAL;
+
+       if (!pos)
+               return -EINVAL;
+
+       *pos = ds_get_index(tracer->ds.context, ds_pebs);
+
+       return 0;
 }
 
-static int ds_get_end(struct task_struct *task, size_t *pos,
-                     enum ds_qualifier qual)
+static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual)
 {
-       struct ds_context *context;
-       unsigned long base, end;
-       int error;
-
-       context = ds_get_context(task);
-       error = ds_validate_access(context, qual);
-       if (error < 0)
-               goto out;
+       unsigned long base, max;
 
        base = ds_get(context->ds, qual, ds_buffer_base);
-       end  = ds_get(context->ds, qual, ds_absolute_maximum);
+       max  = ds_get(context->ds, qual, ds_absolute_maximum);
 
-       error = ((end - base) / ds_cfg.sizeof_rec[qual]);
-       if (pos)
-               *pos = error;
- out:
-       ds_put_context(context);
-       return error;
+       return (max - base) / ds_cfg.sizeof_rec[qual];
 }
 
-int ds_get_bts_end(struct task_struct *task, size_t *pos)
+int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos)
 {
-       return ds_get_end(task, pos, ds_bts);
+       if (!tracer)
+               return -EINVAL;
+
+       if (!pos)
+               return -EINVAL;
+
+       *pos = ds_get_end(tracer->ds.context, ds_bts);
+
+       return 0;
 }
 
-int ds_get_pebs_end(struct task_struct *task, size_t *pos)
+int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos)
 {
-       return ds_get_end(task, pos, ds_pebs);
+       if (!tracer)
+               return -EINVAL;
+
+       if (!pos)
+               return -EINVAL;
+
+       *pos = ds_get_end(tracer->ds.context, ds_pebs);
+
+       return 0;
 }
 
-static int ds_access(struct task_struct *task, size_t index,
-                    const void **record, enum ds_qualifier qual)
+static int ds_access(struct ds_context *context, enum ds_qualifier qual,
+                    size_t index, const void **record)
 {
-       struct ds_context *context;
        unsigned long base, idx;
-       int error;
 
        if (!record)
                return -EINVAL;
 
-       context = ds_get_context(task);
-       error = ds_validate_access(context, qual);
-       if (error < 0)
-               goto out;
-
        base = ds_get(context->ds, qual, ds_buffer_base);
        idx = base + (index * ds_cfg.sizeof_rec[qual]);
 
-       error = -EINVAL;
        if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
-               goto out;
+               return -EINVAL;
 
        *record = (const void *)idx;
-       error = ds_cfg.sizeof_rec[qual];
- out:
-       ds_put_context(context);
-       return error;
+
+       return ds_cfg.sizeof_rec[qual];
 }
 
-int ds_access_bts(struct task_struct *task, size_t index, const void **record)
+int ds_access_bts(struct bts_tracer *tracer, size_t index,
+                 const void **record)
 {
-       return ds_access(task, index, record, ds_bts);
+       if (!tracer)
+               return -EINVAL;
+
+       return ds_access(tracer->ds.context, ds_bts, index, record);
 }
 
-int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
+int ds_access_pebs(struct pebs_tracer *tracer, size_t index,
+                  const void **record)
 {
-       return ds_access(task, index, record, ds_pebs);
+       if (!tracer)
+               return -EINVAL;
+
+       return ds_access(tracer->ds.context, ds_pebs, index, record);
 }
 
-static int ds_write(struct task_struct *task, const void *record, size_t size,
-                   enum ds_qualifier qual, int force)
+static int ds_write(struct ds_context *context, enum ds_qualifier qual,
+                   const void *record, size_t size)
 {
-       struct ds_context *context;
-       int error;
+       int bytes_written = 0;
 
        if (!record)
                return -EINVAL;
 
-       error = -EPERM;
-       context = ds_get_context(task);
-       if (!context)
-               goto out;
-
-       if (!force) {
-               error = ds_validate_access(context, qual);
-               if (error < 0)
-                       goto out;
-       }
-
-       error = 0;
        while (size) {
                unsigned long base, index, end, write_end, int_th;
                unsigned long write_size, adj_write_size;
@@ -678,14 +625,14 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,
                        write_end = end;
 
                if (write_end <= index)
-                       goto out;
+                       break;
 
                write_size = min((unsigned long) size, write_end - index);
                memcpy((void *)index, record, write_size);
 
                record = (const char *)record + write_size;
-               size  -= write_size;
-               error += write_size;
+               size -= write_size;
+               bytes_written += write_size;
 
                adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
                adj_write_size *= ds_cfg.sizeof_rec[qual];
@@ -700,47 +647,32 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,
                ds_set(context->ds, qual, ds_index, index);
 
                if (index >= int_th)
-                       ds_overflow(task, context, qual);
+                       ds_overflow(context, qual);
        }
 
- out:
-       ds_put_context(context);
-       return error;
+       return bytes_written;
 }
 
-int ds_write_bts(struct task_struct *task, const void *record, size_t size)
+int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size)
 {
-       return ds_write(task, record, size, ds_bts, /* force = */ 0);
-}
+       if (!tracer)
+               return -EINVAL;
 
-int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
-{
-       return ds_write(task, record, size, ds_pebs, /* force = */ 0);
+       return ds_write(tracer->ds.context, ds_bts, record, size);
 }
 
-int ds_unchecked_write_bts(struct task_struct *task,
-                          const void *record, size_t size)
+int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size)
 {
-       return ds_write(task, record, size, ds_bts, /* force = */ 1);
-}
+       if (!tracer)
+               return -EINVAL;
 
-int ds_unchecked_write_pebs(struct task_struct *task,
-                           const void *record, size_t size)
-{
-       return ds_write(task, record, size, ds_pebs, /* force = */ 1);
+       return ds_write(tracer->ds.context, ds_pebs, record, size);
 }
 
-static int ds_reset_or_clear(struct task_struct *task,
-                            enum ds_qualifier qual, int clear)
+static void ds_reset_or_clear(struct ds_context *context,
+                             enum ds_qualifier qual, int clear)
 {
-       struct ds_context *context;
        unsigned long base, end;
-       int error;
-
-       context = ds_get_context(task);
-       error = ds_validate_access(context, qual);
-       if (error < 0)
-               goto out;
 
        base = ds_get(context->ds, qual, ds_buffer_base);
        end  = ds_get(context->ds, qual, ds_absolute_maximum);
@@ -749,70 +681,69 @@ static int ds_reset_or_clear(struct task_struct *task,
                memset((void *)base, 0, end - base);
 
        ds_set(context->ds, qual, ds_index, base);
-
-       error = 0;
- out:
-       ds_put_context(context);
-       return error;
 }
 
-int ds_reset_bts(struct task_struct *task)
+int ds_reset_bts(struct bts_tracer *tracer)
 {
-       return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
+       if (!tracer)
+               return -EINVAL;
+
+       ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0);
+
+       return 0;
 }
 
-int ds_reset_pebs(struct task_struct *task)
+int ds_reset_pebs(struct pebs_tracer *tracer)
 {
-       return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
+       if (!tracer)
+               return -EINVAL;
+
+       ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0);
+
+       return 0;
 }
 
-int ds_clear_bts(struct task_struct *task)
+int ds_clear_bts(struct bts_tracer *tracer)
 {
-       return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
+       if (!tracer)
+               return -EINVAL;
+
+       ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1);
+
+       return 0;
 }
 
-int ds_clear_pebs(struct task_struct *task)
+int ds_clear_pebs(struct pebs_tracer *tracer)
 {
-       return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
+       if (!tracer)
+               return -EINVAL;
+
+       ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1);
+
+       return 0;
 }
 
-int ds_get_pebs_reset(struct task_struct *task, u64 *value)
+int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value)
 {
-       struct ds_context *context;
-       int error;
+       if (!tracer)
+               return -EINVAL;
 
        if (!value)
                return -EINVAL;
 
-       context = ds_get_context(task);
-       error = ds_validate_access(context, ds_pebs);
-       if (error < 0)
-               goto out;
+       *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
 
-       *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
-
-       error = 0;
- out:
-       ds_put_context(context);
-       return error;
+       return 0;
 }
 
-int ds_set_pebs_reset(struct task_struct *task, u64 value)
+int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
 {
-       struct ds_context *context;
-       int error;
-
-       context = ds_get_context(task);
-       error = ds_validate_access(context, ds_pebs);
-       if (error < 0)
-               goto out;
+       if (!tracer)
+               return -EINVAL;
 
-       *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
+       *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
 
-       error = 0;
- out:
-       ds_put_context(context);
-       return error;
+       return 0;
 }
 
 static const struct ds_configuration ds_cfg_var = {
@@ -840,6 +771,10 @@ static inline void
 ds_configure(const struct ds_configuration *cfg)
 {
        ds_cfg = *cfg;
+
+       printk(KERN_INFO "DS available\n");
+
+       BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds);
 }
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -847,17 +782,16 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
        switch (c->x86) {
        case 0x6:
                switch (c->x86_model) {
+               case 0 ... 0xC:
+                       /* sorry, don't know about them */
+                       break;
                case 0xD:
                case 0xE: /* Pentium M */
                        ds_configure(&ds_cfg_var);
                        break;
-               case 0xF: /* Core2 */
-               case 0x1C: /* Atom */
+               default: /* Core2, Atom, ... */
                        ds_configure(&ds_cfg_64);
                        break;
-               default:
-                       /* sorry, don't know about them */
-                       break;
                }
                break;
        case 0xF:
@@ -884,6 +818,8 @@ void ds_free(struct ds_context *context)
         * is dying. There should not be any user of that context left
         * to disturb us, anymore. */
        unsigned long leftovers = context->count;
-       while (leftovers--)
+       while (leftovers--) {
+               put_tracer(context->task);
                ds_put_context(context);
+       }
 }
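
Worked example (not from the patch) of the arithmetic in ds_install_ds_config(): with a hypothetical 90-byte buffer at address 0x1004, an 8-byte DS_ALIGNMENT and a 24-byte record size, the base is rounded up to 0x1008 (adj = 4), the usable size drops to 86 bytes and is rounded down to 3 whole records (72 bytes), so ds_absolute_maximum ends up at 0x1008 + 72 = 0x1050. A sketch of the same adjustment in isolation, with hypothetical constants:

#include <linux/kernel.h>               /* for ALIGN() */

#define EXAMPLE_ALIGNMENT       (1 << 3)        /* mirrors DS_ALIGNMENT */
#define EXAMPLE_RECORD_SIZE     24              /* hypothetical sizeof_rec[qual] */

static void example_adjust(unsigned long base, unsigned long size,
                           unsigned long *start, unsigned long *end)
{
        unsigned long adj = ALIGN(base, EXAMPLE_ALIGNMENT) - base;

        base += adj;                            /* double-word align the base */
        size -= adj;

        size /= EXAMPLE_RECORD_SIZE;            /* keep only whole records */
        size *= EXAMPLE_RECORD_SIZE;

        *start = base;                          /* ds_buffer_base and ds_index */
        *end = base + size;                     /* ds_absolute_maximum */
}
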
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
new file mode 100644 (file)
index 0000000..6b1f6f6
--- /dev/null
@@ -0,0 +1,351 @@
+/*
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/sysfs.h>
+
+#include <asm/stacktrace.h>
+
+#include "dumpstack.h"
+
+int panic_on_unrecovered_nmi;
+unsigned int code_bytes = 64;
+int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
+static int die_counter;
+
+void printk_address(unsigned long address, int reliable)
+{
+       printk(" [<%p>] %s%pS\n", (void *) address,
+                       reliable ? "" : "? ", (void *) address);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+                       const struct stacktrace_ops *ops,
+                       struct thread_info *tinfo, int *graph)
+{
+       struct task_struct *task = tinfo->task;
+       unsigned long ret_addr;
+       int index = task->curr_ret_stack;
+
+       if (addr != (unsigned long)return_to_handler)
+               return;
+
+       if (!task->ret_stack || index < *graph)
+               return;
+
+       index -= *graph;
+       ret_addr = task->ret_stack[index].ret;
+
+       ops->address(data, ret_addr, 1);
+
+       (*graph)++;
+}
+#else
+static inline void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+                       const struct stacktrace_ops *ops,
+                       struct thread_info *tinfo, int *graph)
+{ }
+#endif
+
+/*
+ * x86-64 can have up to three kernel stacks:
+ * process stack
+ * interrupt stack
+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
+ */
+
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+                       void *p, unsigned int size, void *end)
+{
+       void *t = tinfo;
+       if (end) {
+               if (p < end && p >= (end-THREAD_SIZE))
+                       return 1;
+               else
+                       return 0;
+       }
+       return p > t && p < t + THREAD_SIZE - size;
+}
+
+unsigned long
+print_context_stack(struct thread_info *tinfo,
+               unsigned long *stack, unsigned long bp,
+               const struct stacktrace_ops *ops, void *data,
+               unsigned long *end, int *graph)
+{
+       struct stack_frame *frame = (struct stack_frame *)bp;
+
+       while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
+               unsigned long addr;
+
+               addr = *stack;
+               if (__kernel_text_address(addr)) {
+                       if ((unsigned long) stack == bp + sizeof(long)) {
+                               ops->address(data, addr, 1);
+                               frame = frame->next_frame;
+                               bp = (unsigned long) frame;
+                       } else {
+                               ops->address(data, addr, bp == 0);
+                       }
+                       print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
+               }
+               stack++;
+       }
+       return bp;
+}
+
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+       printk(data);
+       print_symbol(msg, symbol);
+       printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+       printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+       printk("%s <%s> ", (char *)data, name);
+       return 0;
+}
+
+/*
+ * Print one address/symbol entry per line.
+ */
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+       touch_nmi_watchdog();
+       printk(data);
+       printk_address(addr, reliable);
+}
+
+static const struct stacktrace_ops print_trace_ops = {
+       .warning = print_trace_warning,
+       .warning_symbol = print_trace_warning_symbol,
+       .stack = print_trace_stack,
+       .address = print_trace_address,
+};
+
+void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *stack, unsigned long bp, char *log_lvl)
+{
+       printk("%sCall Trace:\n", log_lvl);
+       dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *stack, unsigned long bp)
+{
+       show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+       show_stack_log_lvl(task, NULL, sp, 0, "");
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+       unsigned long bp = 0;
+       unsigned long stack;
+
+#ifdef CONFIG_FRAME_POINTER
+       if (!bp)
+               get_bp(bp);
+#endif
+
+       printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+               current->pid, current->comm, print_tainted(),
+               init_utsname()->release,
+               (int)strcspn(init_utsname()->version, " "),
+               init_utsname()->version);
+       show_trace(NULL, NULL, &stack, bp);
+}
+EXPORT_SYMBOL(dump_stack);
+
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+       int cpu;
+       unsigned long flags;
+
+       oops_enter();
+
+       /* racy, but better than risking deadlock. */
+       raw_local_irq_save(flags);
+       cpu = smp_processor_id();
+       if (!__raw_spin_trylock(&die_lock)) {
+               if (cpu == die_owner)
+                       /* nested oops. should stop eventually */;
+               else
+                       __raw_spin_lock(&die_lock);
+       }
+       die_nest_count++;
+       die_owner = cpu;
+       console_verbose();
+       bust_spinlocks(1);
+       return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+       if (regs && kexec_should_crash(current))
+               crash_kexec(regs);
+
+       bust_spinlocks(0);
+       die_owner = -1;
+       add_taint(TAINT_DIE);
+       die_nest_count--;
+       if (!die_nest_count)
+               /* Nest count reaches zero, release the lock. */
+               __raw_spin_unlock(&die_lock);
+       raw_local_irq_restore(flags);
+       oops_exit();
+
+       if (!signr)
+               return;
+       if (in_interrupt())
+               panic("Fatal exception in interrupt");
+       if (panic_on_oops)
+               panic("Fatal exception");
+       do_exit(signr);
+}
+
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+{
+#ifdef CONFIG_X86_32
+       unsigned short ss;
+       unsigned long sp;
+#endif
+       printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+       printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+       printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       printk("DEBUG_PAGEALLOC");
+#endif
+       printk("\n");
+       sysfs_printk_last_file();
+       if (notify_die(DIE_OOPS, str, regs, err,
+                       current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+               return 1;
+
+       show_registers(regs);
+#ifdef CONFIG_X86_32
+       sp = (unsigned long) (&regs->sp);
+       savesegment(ss, ss);
+       if (user_mode(regs)) {
+               sp = regs->sp;
+               ss = regs->ss & 0xffff;
+       }
+       printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+       print_symbol("%s", regs->ip);
+       printk(" SS:ESP %04x:%08lx\n", ss, sp);
+#else
+       /* Executive summary in case the oops scrolled away */
+       printk(KERN_ALERT "RIP ");
+       printk_address(regs->ip, 1);
+       printk(" RSP <%016lx>\n", regs->sp);
+#endif
+       return 0;
+}
+
+/*
+ * This is gone through when something in the kernel has done something bad
+ * and is about to be terminated:
+ */
+void die(const char *str, struct pt_regs *regs, long err)
+{
+       unsigned long flags = oops_begin();
+       int sig = SIGSEGV;
+
+       if (!user_mode_vm(regs))
+               report_bug(regs->ip, regs);
+
+       if (__die(str, regs, err))
+               sig = 0;
+       oops_end(flags, regs, sig);
+}
+
+void notrace __kprobes
+die_nmi(char *str, struct pt_regs *regs, int do_panic)
+{
+       unsigned long flags;
+
+       if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+               return;
+
+       /*
+        * We are in trouble anyway, lets at least try
+        * to get a message out.
+        */
+       flags = oops_begin();
+       printk(KERN_EMERG "%s", str);
+       printk(" on CPU%d, ip %08lx, registers:\n",
+               smp_processor_id(), regs->ip);
+       show_registers(regs);
+       oops_end(flags, regs, 0);
+       if (do_panic || panic_on_oops)
+               panic("Non maskable interrupt");
+       nmi_exit();
+       local_irq_enable();
+       do_exit(SIGBUS);
+}
+
+static int __init oops_setup(char *s)
+{
+       if (!s)
+               return -EINVAL;
+       if (!strcmp(s, "panic"))
+               panic_on_oops = 1;
+       return 0;
+}
+early_param("oops", oops_setup);
+
+static int __init kstack_setup(char *s)
+{
+       if (!s)
+               return -EINVAL;
+       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+       return 0;
+}
+early_param("kstack", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+       code_bytes = simple_strtoul(s, NULL, 0);
+       if (code_bytes > 8192)
+               code_bytes = 8192;
+
+       return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
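
Conceptual sketch, not part of the patch: the frame-pointer walk that print_context_stack() performs boils down to following the struct stack_frame chain; the real code additionally range-checks every pointer against the thread stack and, with the graph tracer enabled, substitutes the saved return addresses for return_to_handler. The struct copy, helper name and depth limit below are illustrative only.

#include <linux/kernel.h>

/* mirrors struct stack_frame from dumpstack.h */
struct example_stack_frame {
        struct example_stack_frame *next_frame;
        unsigned long return_address;
};

static void example_walk_frames(unsigned long bp, int max_depth)
{
        struct example_stack_frame *frame = (struct example_stack_frame *)bp;

        while (frame && max_depth--) {
                /* a real walker must validate frame against the stack bounds */
                printk(" [<%p>]\n", (void *)frame->return_address);
                frame = frame->next_frame;
        }
}
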
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
new file mode 100644 (file)
index 0000000..da87590
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+
+#ifndef DUMPSTACK_H
+#define DUMPSTACK_H
+
+#ifdef CONFIG_X86_32
+#define STACKSLOTS_PER_LINE 8
+#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
+#else
+#define STACKSLOTS_PER_LINE 4
+#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
+#endif
+
+extern unsigned long
+print_context_stack(struct thread_info *tinfo,
+               unsigned long *stack, unsigned long bp,
+               const struct stacktrace_ops *ops, void *data,
+               unsigned long *end, int *graph);
+
+extern void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *stack, unsigned long bp, char *log_lvl);
+
+extern void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *sp, unsigned long bp, char *log_lvl);
+
+extern unsigned int code_bytes;
+extern int kstack_depth_to_print;
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+       struct stack_frame *next_frame;
+       unsigned long return_address;
+};
+#endif
index b361475..d593cd1 100644 (file)
 
 #include <asm/stacktrace.h>
 
-#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
-static unsigned int code_bytes = 64;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
-       printk(" [<%p>] %s%pS\n", (void *) address,
-                       reliable ? "" : "? ", (void *) address);
-}
-
-static inline int valid_stack_ptr(struct thread_info *tinfo,
-                       void *p, unsigned int size, void *end)
-{
-       void *t = tinfo;
-       if (end) {
-               if (p < end && p >= (end-THREAD_SIZE))
-                       return 1;
-               else
-                       return 0;
-       }
-       return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
-       struct stack_frame *next_frame;
-       unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
-               unsigned long *stack, unsigned long bp,
-               const struct stacktrace_ops *ops, void *data,
-               unsigned long *end)
-{
-       struct stack_frame *frame = (struct stack_frame *)bp;
-
-       while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
-               unsigned long addr;
-
-               addr = *stack;
-               if (__kernel_text_address(addr)) {
-                       if ((unsigned long) stack == bp + sizeof(long)) {
-                               ops->address(data, addr, 1);
-                               frame = frame->next_frame;
-                               bp = (unsigned long) frame;
-                       } else {
-                               ops->address(data, addr, bp == 0);
-                       }
-               }
-               stack++;
-       }
-       return bp;
-}
+#include "dumpstack.h"
 
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
                unsigned long *stack, unsigned long bp,
                const struct stacktrace_ops *ops, void *data)
 {
+       int graph = 0;
+
        if (!task)
                task = current;
 
@@ -107,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 
                context = (struct thread_info *)
                        ((unsigned long)stack & (~(THREAD_SIZE - 1)));
-               bp = print_context_stack(context, stack, bp, ops, data, NULL);
+               bp = print_context_stack(context, stack, bp, ops,
+                                        data, NULL, &graph);
 
                stack = (unsigned long *)context->previous_esp;
                if (!stack)
@@ -119,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 }
 EXPORT_SYMBOL(dump_trace);
 
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-       printk(data);
-       print_symbol(msg, symbol);
-       printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
-       printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
-       printk("%s <%s> ", (char *)data, name);
-       return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
-       touch_nmi_watchdog();
-       printk(data);
-       printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
-       .warning = print_trace_warning,
-       .warning_symbol = print_trace_warning_symbol,
-       .stack = print_trace_stack,
-       .address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp, char *log_lvl)
-{
-       printk("%sCall Trace:\n", log_lvl);
-       dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp)
-{
-       show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
-static void
+void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
                unsigned long *sp, unsigned long bp, char *log_lvl)
 {
@@ -196,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
        show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
-       show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
-       unsigned long bp = 0;
-       unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
-       if (!bp)
-               get_bp(bp);
-#endif
-
-       printk("Pid: %d, comm: %.20s %s %s %.*s\n",
-               current->pid, current->comm, print_tainted(),
-               init_utsname()->release,
-               (int)strcspn(init_utsname()->version, " "),
-               init_utsname()->version);
-       show_trace(NULL, NULL, &stack, bp);
-}
-
-EXPORT_SYMBOL(dump_stack);
 
 void show_registers(struct pt_regs *regs)
 {
@@ -283,167 +152,3 @@ int is_valid_bugaddr(unsigned long ip)
        return ud2 == 0x0b0f;
 }
 
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
-       unsigned long flags;
-
-       oops_enter();
-
-       if (die_owner != raw_smp_processor_id()) {
-               console_verbose();
-               raw_local_irq_save(flags);
-               __raw_spin_lock(&die_lock);
-               die_owner = smp_processor_id();
-               die_nest_count = 0;
-               bust_spinlocks(1);
-       } else {
-               raw_local_irq_save(flags);
-       }
-       die_nest_count++;
-       return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
-       bust_spinlocks(0);
-       die_owner = -1;
-       add_taint(TAINT_DIE);
-       __raw_spin_unlock(&die_lock);
-       raw_local_irq_restore(flags);
-
-       if (!regs)
-               return;
-
-       if (kexec_should_crash(current))
-               crash_kexec(regs);
-       if (in_interrupt())
-               panic("Fatal exception in interrupt");
-       if (panic_on_oops)
-               panic("Fatal exception");
-       oops_exit();
-       do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
-       unsigned short ss;
-       unsigned long sp;
-
-       printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
-       printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
-       printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-       printk("DEBUG_PAGEALLOC");
-#endif
-       printk("\n");
-       sysfs_printk_last_file();
-       if (notify_die(DIE_OOPS, str, regs, err,
-                       current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
-               return 1;
-
-       show_registers(regs);
-       /* Executive summary in case the oops scrolled away */
-       sp = (unsigned long) (&regs->sp);
-       savesegment(ss, ss);
-       if (user_mode(regs)) {
-               sp = regs->sp;
-               ss = regs->ss & 0xffff;
-       }
-       printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
-       print_symbol("%s", regs->ip);
-       printk(" SS:ESP %04x:%08lx\n", ss, sp);
-       return 0;
-}
-
-/*
- * This is gone through when something in the kernel has done something bad
- * and is about to be terminated:
- */
-void die(const char *str, struct pt_regs *regs, long err)
-{
-       unsigned long flags = oops_begin();
-
-       if (die_nest_count < 3) {
-               report_bug(regs->ip, regs);
-
-               if (__die(str, regs, err))
-                       regs = NULL;
-       } else {
-               printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
-       }
-
-       oops_end(flags, regs, SIGSEGV);
-}
-
-static DEFINE_SPINLOCK(nmi_print_lock);
-
-void notrace __kprobes
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
-       if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
-               return;
-
-       spin_lock(&nmi_print_lock);
-       /*
-       * We are in trouble anyway, lets at least try
-       * to get a message out:
-       */
-       bust_spinlocks(1);
-       printk(KERN_EMERG "%s", str);
-       printk(" on CPU%d, ip %08lx, registers:\n",
-               smp_processor_id(), regs->ip);
-       show_registers(regs);
-       if (do_panic)
-               panic("Non maskable interrupt");
-       console_silent();
-       spin_unlock(&nmi_print_lock);
-
-       /*
-        * If we are in kernel we are probably nested up pretty bad
-        * and might aswell get out now while we still can:
-        */
-       if (!user_mode_vm(regs)) {
-               current->thread.trap_no = 2;
-               crash_kexec(regs);
-       }
-
-       bust_spinlocks(0);
-       do_exit(SIGSEGV);
-}
-
-static int __init oops_setup(char *s)
-{
-       if (!s)
-               return -EINVAL;
-       if (!strcmp(s, "panic"))
-               panic_on_oops = 1;
-       return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
-       if (!s)
-               return -EINVAL;
-       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
-       return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
-       code_bytes = simple_strtoul(s, NULL, 0);
-       if (code_bytes > 8192)
-               code_bytes = 8192;
-
-       return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
index 96a5db7..c302d07 100644
 
 #include <asm/stacktrace.h>
 
-#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
-static unsigned int code_bytes = 64;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
-       printk(" [<%p>] %s%pS\n", (void *) address,
-                       reliable ? "" : "? ", (void *) address);
-}
+#include "dumpstack.h"
 
 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
                                        unsigned *usedp, char **idp)
@@ -113,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
  * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
  */
 
-static inline int valid_stack_ptr(struct thread_info *tinfo,
-                       void *p, unsigned int size, void *end)
-{
-       void *t = tinfo;
-       if (end) {
-               if (p < end && p >= (end-THREAD_SIZE))
-                       return 1;
-               else
-                       return 0;
-       }
-       return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
-       struct stack_frame *next_frame;
-       unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
-               unsigned long *stack, unsigned long bp,
-               const struct stacktrace_ops *ops, void *data,
-               unsigned long *end)
-{
-       struct stack_frame *frame = (struct stack_frame *)bp;
-
-       while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
-               unsigned long addr;
-
-               addr = *stack;
-               if (__kernel_text_address(addr)) {
-                       if ((unsigned long) stack == bp + sizeof(long)) {
-                               ops->address(data, addr, 1);
-                               frame = frame->next_frame;
-                               bp = (unsigned long) frame;
-                       } else {
-                               ops->address(data, addr, bp == 0);
-                       }
-               }
-               stack++;
-       }
-       return bp;
-}
-
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
                unsigned long *stack, unsigned long bp,
                const struct stacktrace_ops *ops, void *data)
@@ -166,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
        unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
        unsigned used = 0;
        struct thread_info *tinfo;
+       int graph = 0;
 
        if (!task)
                task = current;
@@ -206,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
                                break;
 
                        bp = print_context_stack(tinfo, stack, bp, ops,
-                                                       data, estack_end);
+                                                data, estack_end, &graph);
                        ops->stack(data, "<EOE>");
                        /*
                         * We link to the next stack via the
@@ -225,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
                                if (ops->stack(data, "IRQ") < 0)
                                        break;
                                bp = print_context_stack(tinfo, stack, bp,
-                                               ops, data, irqstack_end);
+                                       ops, data, irqstack_end, &graph);
                                /*
                                 * We link to the next stack (which would be
                                 * the process stack normally) the last
@@ -243,62 +187,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
        /*
         * This handles the process stack:
         */
-       bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
+       bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
        put_cpu();
 }
 EXPORT_SYMBOL(dump_trace);
 
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-       printk(data);
-       print_symbol(msg, symbol);
-       printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
-       printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
-       printk("%s <%s> ", (char *)data, name);
-       return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
-       touch_nmi_watchdog();
-       printk(data);
-       printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
-       .warning = print_trace_warning,
-       .warning_symbol = print_trace_warning_symbol,
-       .stack = print_trace_stack,
-       .address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp, char *log_lvl)
-{
-       printk("%sCall Trace:\n", log_lvl);
-       dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp)
-{
-       show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
-static void
+void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
                unsigned long *sp, unsigned long bp, char *log_lvl)
 {
@@ -342,33 +236,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
        show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
-       show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
-       unsigned long bp = 0;
-       unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
-       if (!bp)
-               get_bp(bp);
-#endif
-
-       printk("Pid: %d, comm: %.20s %s %s %.*s\n",
-               current->pid, current->comm, print_tainted(),
-               init_utsname()->release,
-               (int)strcspn(init_utsname()->version, " "),
-               init_utsname()->version);
-       show_trace(NULL, NULL, &stack, bp);
-}
-EXPORT_SYMBOL(dump_stack);
-
 void show_registers(struct pt_regs *regs)
 {
        int i;
@@ -429,147 +296,3 @@ int is_valid_bugaddr(unsigned long ip)
        return ud2 == 0x0b0f;
 }
 
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
-       int cpu;
-       unsigned long flags;
-
-       oops_enter();
-
-       /* racy, but better than risking deadlock. */
-       raw_local_irq_save(flags);
-       cpu = smp_processor_id();
-       if (!__raw_spin_trylock(&die_lock)) {
-               if (cpu == die_owner)
-                       /* nested oops. should stop eventually */;
-               else
-                       __raw_spin_lock(&die_lock);
-       }
-       die_nest_count++;
-       die_owner = cpu;
-       console_verbose();
-       bust_spinlocks(1);
-       return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
-       die_owner = -1;
-       bust_spinlocks(0);
-       die_nest_count--;
-       if (!die_nest_count)
-               /* Nest count reaches zero, release the lock. */
-               __raw_spin_unlock(&die_lock);
-       raw_local_irq_restore(flags);
-       if (!regs) {
-               oops_exit();
-               return;
-       }
-       if (in_interrupt())
-               panic("Fatal exception in interrupt");
-       if (panic_on_oops)
-               panic("Fatal exception");
-       oops_exit();
-       do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
-       printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
-       printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
-       printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-       printk("DEBUG_PAGEALLOC");
-#endif
-       printk("\n");
-       sysfs_printk_last_file();
-       if (notify_die(DIE_OOPS, str, regs, err,
-                       current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
-               return 1;
-
-       show_registers(regs);
-       add_taint(TAINT_DIE);
-       /* Executive summary in case the oops scrolled away */
-       printk(KERN_ALERT "RIP ");
-       printk_address(regs->ip, 1);
-       printk(" RSP <%016lx>\n", regs->sp);
-       if (kexec_should_crash(current))
-               crash_kexec(regs);
-       return 0;
-}
-
-void die(const char *str, struct pt_regs *regs, long err)
-{
-       unsigned long flags = oops_begin();
-
-       if (!user_mode(regs))
-               report_bug(regs->ip, regs);
-
-       if (__die(str, regs, err))
-               regs = NULL;
-       oops_end(flags, regs, SIGSEGV);
-}
-
-notrace __kprobes void
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
-       unsigned long flags;
-
-       if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
-               return;
-
-       flags = oops_begin();
-       /*
-        * We are in trouble anyway, lets at least try
-        * to get a message out.
-        */
-       printk(KERN_EMERG "%s", str);
-       printk(" on CPU%d, ip %08lx, registers:\n",
-               smp_processor_id(), regs->ip);
-       show_registers(regs);
-       if (kexec_should_crash(current))
-               crash_kexec(regs);
-       if (do_panic || panic_on_oops)
-               panic("Non maskable interrupt");
-       oops_end(flags, NULL, SIGBUS);
-       nmi_exit();
-       local_irq_enable();
-       do_exit(SIGBUS);
-}
-
-static int __init oops_setup(char *s)
-{
-       if (!s)
-               return -EINVAL;
-       if (!strcmp(s, "panic"))
-               panic_on_oops = 1;
-       return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
-       if (!s)
-               return -EINVAL;
-       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
-       return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
-       code_bytes = simple_strtoul(s, NULL, 0);
-       if (code_bytes > 8192)
-               code_bytes = 8192;
-
-       return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
index 28b597e..43ceb3f 100644
@@ -1157,6 +1157,9 @@ ENTRY(mcount)
 END(mcount)
 
 ENTRY(ftrace_caller)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
+
        pushl %eax
        pushl %ecx
        pushl %edx
@@ -1171,6 +1174,11 @@ ftrace_call:
        popl %edx
        popl %ecx
        popl %eax
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+       jmp ftrace_stub
+#endif
 
 .globl ftrace_stub
 ftrace_stub:
@@ -1180,8 +1188,18 @@ END(ftrace_caller)
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 
 ENTRY(mcount)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
+
        cmpl $ftrace_stub, ftrace_trace_function
        jnz trace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       cmpl $ftrace_stub, ftrace_graph_return
+       jnz ftrace_graph_caller
+
+       cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+       jnz ftrace_graph_caller
+#endif
 .globl ftrace_stub
 ftrace_stub:
        ret
@@ -1200,12 +1218,43 @@ trace:
        popl %edx
        popl %ecx
        popl %eax
-
        jmp ftrace_stub
 END(mcount)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+       cmpl $0, function_trace_stop
+       jne ftrace_stub
+
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       movl 0xc(%esp), %edx
+       lea 0x4(%ebp), %eax
+       subl $MCOUNT_INSN_SIZE, %edx
+       call prepare_ftrace_return
+       popl %edx
+       popl %ecx
+       popl %eax
+       ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+       pushl $0
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       call ftrace_return_to_handler
+       movl %eax, 0xc(%esp)
+       popl %edx
+       popl %ecx
+       popl %eax
+       ret
+#endif
+
 .section .rodata,"a"
 #include "syscall_table_32.S"
 
index b86f332..54e0bbd 100644
@@ -68,6 +68,8 @@ ENTRY(mcount)
 END(mcount)
 
 ENTRY(ftrace_caller)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
 
        /* taken from glibc */
        subq $0x38, %rsp
@@ -96,6 +98,12 @@ ftrace_call:
        movq (%rsp), %rax
        addq $0x38, %rsp
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+       jmp ftrace_stub
+#endif
+
 .globl ftrace_stub
 ftrace_stub:
        retq
@@ -103,8 +111,20 @@ END(ftrace_caller)
 
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 ENTRY(mcount)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
+
        cmpq $ftrace_stub, ftrace_trace_function
        jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       cmpq $ftrace_stub, ftrace_graph_return
+       jnz ftrace_graph_caller
+
+       cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+       jnz ftrace_graph_caller
+#endif
+
 .globl ftrace_stub
 ftrace_stub:
        retq
@@ -140,6 +160,69 @@ END(mcount)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+       cmpl $0, function_trace_stop
+       jne ftrace_stub
+
+       subq $0x38, %rsp
+       movq %rax, (%rsp)
+       movq %rcx, 8(%rsp)
+       movq %rdx, 16(%rsp)
+       movq %rsi, 24(%rsp)
+       movq %rdi, 32(%rsp)
+       movq %r8, 40(%rsp)
+       movq %r9, 48(%rsp)
+
+       leaq 8(%rbp), %rdi
+       movq 0x38(%rsp), %rsi
+       subq $MCOUNT_INSN_SIZE, %rsi
+
+       call    prepare_ftrace_return
+
+       movq 48(%rsp), %r9
+       movq 40(%rsp), %r8
+       movq 32(%rsp), %rdi
+       movq 24(%rsp), %rsi
+       movq 16(%rsp), %rdx
+       movq 8(%rsp), %rcx
+       movq (%rsp), %rax
+       addq $0x38, %rsp
+       retq
+END(ftrace_graph_caller)
+
+
+.globl return_to_handler
+return_to_handler:
+       subq  $80, %rsp
+
+       movq %rax, (%rsp)
+       movq %rcx, 8(%rsp)
+       movq %rdx, 16(%rsp)
+       movq %rsi, 24(%rsp)
+       movq %rdi, 32(%rsp)
+       movq %r8, 40(%rsp)
+       movq %r9, 48(%rsp)
+       movq %r10, 56(%rsp)
+       movq %r11, 64(%rsp)
+
+       call ftrace_return_to_handler
+
+       movq %rax, 72(%rsp)
+       movq 64(%rsp), %r11
+       movq 56(%rsp), %r10
+       movq 48(%rsp), %r9
+       movq 40(%rsp), %r8
+       movq 32(%rsp), %rdi
+       movq 24(%rsp), %rsi
+       movq 16(%rsp), %rdx
+       movq 8(%rsp), %rcx
+       movq (%rsp), %rax
+       addq $72, %rsp
+       retq
+#endif
+
+
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
 #endif 
index 50ea0ac..1b43086 100644
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
 #include <linux/percpu.h>
+#include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/list.h>
 
 #include <asm/ftrace.h>
+#include <linux/ftrace.h>
 #include <asm/nops.h>
+#include <asm/nmi.h>
 
 
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+#ifdef CONFIG_DYNAMIC_FTRACE
 
 union ftrace_code_union {
        char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@ union ftrace_code_union {
        } __attribute__((packed));
 };
 
-
 static int ftrace_calc_offset(long ip, long addr)
 {
        return (int)(addr - ip);
 }
 
-unsigned char *ftrace_nop_replace(void)
-{
-       return ftrace_nop;
-}
-
-unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 {
        static union ftrace_code_union calc;
 
@@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
        return calc.code;
 }
 
-int
+/*
+ * Modifying code must take extra care. On an SMP machine, if
+ * the code being modified is also being executed on another CPU
+ * that CPU will have undefined results and possibly take a GPF.
+ * We use kstop_machine to stop other CPUs from executing code.
+ * But this does not stop NMIs from happening. We still need
+ * to protect against that. We separate out the modification of
+ * the code to take care of this.
+ *
+ * Two buffers are added: An IP buffer and a "code" buffer.
+ *
+ * 1) Put the instruction pointer into the IP buffer
+ *    and the new code into the "code" buffer.
+ * 2) Set a flag that says we are modifying code
+ * 3) Wait for any running NMIs to finish.
+ * 4) Write the code
+ * 5) clear the flag.
+ * 6) Wait for any running NMIs to finish.
+ *
+ * If an NMI is executed, the first thing it does is to call
+ * "ftrace_nmi_enter". This will check if the flag is set to write
+ * and if it is, it will write what is in the IP and "code" buffers.
+ *
+ * The trick is, it does not matter if everyone is writing the same
+ * content to the code location. Also, if a CPU is executing code
+ * it is OK to write to that code location if the contents being written
+ * are the same as what exists.
+ */
+
+static atomic_t in_nmi = ATOMIC_INIT(0);
+static int mod_code_status;            /* holds return value of text write */
+static int mod_code_write;             /* set when NMI should do the write */
+static void *mod_code_ip;              /* holds the IP to write to */
+static void *mod_code_newcode;         /* holds the text to write to the IP */
+
+static unsigned nmi_wait_count;
+static atomic_t nmi_update_count = ATOMIC_INIT(0);
+
+int ftrace_arch_read_dyn_info(char *buf, int size)
+{
+       int r;
+
+       r = snprintf(buf, size, "%u %u",
+                    nmi_wait_count,
+                    atomic_read(&nmi_update_count));
+       return r;
+}
+
+static void ftrace_mod_code(void)
+{
+       /*
+        * Yes, more than one CPU process can be writing to mod_code_status.
+        * Yes, more than one CPU can be writing to mod_code_status.
+        * But if one were to fail, then they all should, and if one were
+        * to succeed, then they all should.
+        */
+       mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
+                                            MCOUNT_INSN_SIZE);
+}
+
+void ftrace_nmi_enter(void)
+{
+       atomic_inc(&in_nmi);
+       /* Must have in_nmi seen before reading write flag */
+       smp_mb();
+       if (mod_code_write) {
+               ftrace_mod_code();
+               atomic_inc(&nmi_update_count);
+       }
+}
+
+void ftrace_nmi_exit(void)
+{
+       /* Finish all executions before clearing in_nmi */
+       smp_wmb();
+       atomic_dec(&in_nmi);
+}
+
+static void wait_for_nmi(void)
+{
+       int waited = 0;
+
+       while (atomic_read(&in_nmi)) {
+               waited = 1;
+               cpu_relax();
+       }
+
+       if (waited)
+               nmi_wait_count++;
+}
+
+static int
+do_ftrace_mod_code(unsigned long ip, void *new_code)
+{
+       mod_code_ip = (void *)ip;
+       mod_code_newcode = new_code;
+
+       /* The buffers need to be visible before we let NMIs write them */
+       smp_wmb();
+
+       mod_code_write = 1;
+
+       /* Make sure write bit is visible before we wait on NMIs */
+       smp_mb();
+
+       wait_for_nmi();
+
+       /* Make sure all running NMIs have finished before we write the code */
+       smp_mb();
+
+       ftrace_mod_code();
+
+       /* Make sure the write happens before clearing the bit */
+       smp_wmb();
+
+       mod_code_write = 0;
+
+       /* make sure NMIs see the cleared bit */
+       smp_mb();
+
+       wait_for_nmi();
+
+       return mod_code_status;
+}
+
+
+
+
+static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+
+static unsigned char *ftrace_nop_replace(void)
+{
+       return ftrace_nop;
+}
+
+static int
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
                   unsigned char *new_code)
 {
@@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
                return -EINVAL;
 
        /* replace the text with the new text */
-       if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+       if (do_ftrace_mod_code(ip, new_code))
                return -EPERM;
 
        sync_core();
@@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
        return 0;
 }
 
+int ftrace_make_nop(struct module *mod,
+                   struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned char *new, *old;
+       unsigned long ip = rec->ip;
+
+       old = ftrace_call_replace(ip, addr);
+       new = ftrace_nop_replace();
+
+       return ftrace_modify_code(rec->ip, old, new);
+}
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned char *new, *old;
+       unsigned long ip = rec->ip;
+
+       old = ftrace_nop_replace();
+       new = ftrace_call_replace(ip, addr);
+
+       return ftrace_modify_code(rec->ip, old, new);
+}
+
 int ftrace_update_ftrace_func(ftrace_func_t func)
 {
        unsigned long ip = (unsigned long)(&ftrace_call);
@@ -165,3 +320,218 @@ int __init ftrace_dyn_arch_init(void *data)
 
        return 0;
 }
+#endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_graph_call(void);
+
+static int ftrace_mod_jmp(unsigned long ip,
+                         int old_offset, int new_offset)
+{
+       unsigned char code[MCOUNT_INSN_SIZE];
+
+       if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
+               return -EFAULT;
+
+       if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
+               return -EINVAL;
+
+       *(int *)(&code[1]) = new_offset;
+
+       if (do_ftrace_mod_code(ip, &code))
+               return -EPERM;
+
+       return 0;
+}
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+       unsigned long ip = (unsigned long)(&ftrace_graph_call);
+       int old_offset, new_offset;
+
+       old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+       new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+
+       return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+       unsigned long ip = (unsigned long)(&ftrace_graph_call);
+       int old_offset, new_offset;
+
+       old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+       new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+
+       return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+#else /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * These functions are picked from those used in
+ * this file for dynamic ftrace. They have been
+ * simplified to ignore all traces in NMI context.
+ */
+static atomic_t in_nmi;
+
+void ftrace_nmi_enter(void)
+{
+       atomic_inc(&in_nmi);
+}
+
+void ftrace_nmi_exit(void)
+{
+       atomic_dec(&in_nmi);
+}
+
+#endif /* !CONFIG_DYNAMIC_FTRACE */
+
+/* Add a function return address to the trace stack on thread info. */
+static int push_return_trace(unsigned long ret, unsigned long long time,
+                               unsigned long func, int *depth)
+{
+       int index;
+
+       if (!current->ret_stack)
+               return -EBUSY;
+
+       /* The return trace stack is full */
+       if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+               atomic_inc(&current->trace_overrun);
+               return -EBUSY;
+       }
+
+       index = ++current->curr_ret_stack;
+       barrier();
+       current->ret_stack[index].ret = ret;
+       current->ret_stack[index].func = func;
+       current->ret_stack[index].calltime = time;
+       *depth = index;
+
+       return 0;
+}
+
+/* Retrieve a function return address from the trace stack on thread info. */
+static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
+{
+       int index;
+
+       index = current->curr_ret_stack;
+
+       if (unlikely(index < 0)) {
+               ftrace_graph_stop();
+               WARN_ON(1);
+               /* Might as well panic, otherwise we have nowhere to go */
+               *ret = (unsigned long)panic;
+               return;
+       }
+
+       *ret = current->ret_stack[index].ret;
+       trace->func = current->ret_stack[index].func;
+       trace->calltime = current->ret_stack[index].calltime;
+       trace->overrun = atomic_read(&current->trace_overrun);
+       trace->depth = index;
+       barrier();
+       current->curr_ret_stack--;
+
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(void)
+{
+       struct ftrace_graph_ret trace;
+       unsigned long ret;
+
+       pop_return_trace(&trace, &ret);
+       trace.rettime = cpu_clock(raw_smp_processor_id());
+       ftrace_graph_return(&trace);
+
+       if (unlikely(!ret)) {
+               ftrace_graph_stop();
+               WARN_ON(1);
+               /* Might as well panic. What else to do? */
+               ret = (unsigned long)panic;
+       }
+
+       return ret;
+}
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ */
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
+{
+       unsigned long old;
+       unsigned long long calltime;
+       int faulted;
+       struct ftrace_graph_ent trace;
+       unsigned long return_hooker = (unsigned long)
+                               &return_to_handler;
+
+       /* NMIs are currently unsupported */
+       if (unlikely(atomic_read(&in_nmi)))
+               return;
+
+       if (unlikely(atomic_read(&current->tracing_graph_pause)))
+               return;
+
+       /*
+        * Protect against fault, even if it shouldn't
+        * happen. This tool is too intrusive to
+        * ignore such a protection.
+        */
+       asm volatile(
+               "1: " _ASM_MOV " (%[parent_old]), %[old]\n"
+               "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n"
+               "   movl $0, %[faulted]\n"
+
+               ".section .fixup, \"ax\"\n"
+               "3: movl $1, %[faulted]\n"
+               ".previous\n"
+
+               _ASM_EXTABLE(1b, 3b)
+               _ASM_EXTABLE(2b, 3b)
+
+               : [parent_replaced] "=r" (parent), [old] "=r" (old),
+                 [faulted] "=r" (faulted)
+               : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
+               : "memory"
+       );
+
+       if (unlikely(faulted)) {
+               ftrace_graph_stop();
+               WARN_ON(1);
+               return;
+       }
+
+       if (unlikely(!__kernel_text_address(old))) {
+               ftrace_graph_stop();
+               *parent = old;
+               WARN_ON(1);
+               return;
+       }
+
+       calltime = cpu_clock(raw_smp_processor_id());
+
+       if (push_return_trace(old, calltime,
+                               self_addr, &trace.depth) == -EBUSY) {
+               *parent = old;
+               return;
+       }
+
+       trace.func = self_addr;
+
+       /* Only trace if the calling function expects to */
+       if (!ftrace_graph_entry(&trace)) {
+               current->curr_ret_stack--;
+               *parent = old;
+       }
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
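
prepare_ftrace_return() and ftrace_return_to_handler() above only push and pop the per-task return stack; what actually gets recorded is decided by the ftrace_graph_entry and ftrace_graph_return hooks. A minimal sketch of such callbacks, assuming the register_ftrace_graph() interface the tracing core gains elsewhere in this series:

        static int example_graph_entry(struct ftrace_graph_ent *trace)
        {
                /* returning 0 makes prepare_ftrace_return() undo the push */
                return 1;
        }

        static void example_graph_return(struct ftrace_graph_ret *trace)
        {
                /* trace->func, trace->calltime, trace->rettime and trace->depth
                 * describe the completed call */
        }

        /* wired up from e.g. an initcall:
         *      register_ftrace_graph(example_graph_return, example_graph_entry);
         */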
index c622772..c27af49 100644
@@ -7,6 +7,7 @@
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
+#include <linux/ftrace.h>
 #include <asm/system.h>
 
 unsigned long idle_halt;
@@ -100,6 +101,9 @@ static inline int hlt_use_halt(void)
 void default_idle(void)
 {
        if (hlt_use_halt()) {
+               struct power_trace it;
+
+               trace_power_start(&it, POWER_CSTATE, 1);
                current_thread_info()->status &= ~TS_POLLING;
                /*
                 * TS_POLLING-cleared state must be visible before we
@@ -112,6 +116,7 @@ void default_idle(void)
                else
                        local_irq_enable();
                current_thread_info()->status |= TS_POLLING;
+               trace_power_end(&it);
        } else {
                local_irq_enable();
                /* loop is done by the caller */
@@ -154,24 +159,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
+       struct power_trace it;
+
+       trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __mwait(ax, cx);
        }
+       trace_power_end(&it);
 }
 
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
+       struct power_trace it;
        if (!need_resched()) {
+               trace_power_start(&it, POWER_CSTATE, 1);
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __sti_mwait(0, 0);
                else
                        local_irq_enable();
+               trace_power_end(&it);
        } else
                local_irq_enable();
 }
@@ -183,9 +195,13 @@ static void mwait_idle(void)
  */
 static void poll_idle(void)
 {
+       struct power_trace it;
+
+       trace_power_start(&it, POWER_CSTATE, 0);
        local_irq_enable();
        while (!need_resched())
                cpu_relax();
+       trace_power_end(&it);
 }
 
 /*
index 0a1302f..24c2276 100644
@@ -38,6 +38,7 @@
 #include <linux/percpu.h>
 #include <linux/prctl.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -548,7 +549,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
  * the task-switch, and shows up in ret_from_fork in entry.S,
  * for example.
  */
-struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+__notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
        struct thread_struct *prev = &prev_p->thread,
                                 *next = &next_p->thread;
index c958120..fbb321d 100644
@@ -39,6 +39,7 @@
 #include <linux/prctl.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
+#include <linux/ftrace.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -551,8 +552,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
  * - could test fs/gs bitsliced
  *
  * Kprobes not supported here. Set the probe on schedule instead.
+ * Function graph tracer not supported too.
  */
-struct task_struct *
+__notrace_funcgraph struct task_struct *
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
        struct thread_struct *prev = &prev_p->thread;
index 0a6d8c1..2c8ec1b 100644
@@ -668,14 +668,14 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
        size_t bts_index, bts_end;
        int error;
 
-       error = ds_get_bts_end(child, &bts_end);
+       error = ds_get_bts_end(child->bts, &bts_end);
        if (error < 0)
                return error;
 
        if (bts_end <= index)
                return -EINVAL;
 
-       error = ds_get_bts_index(child, &bts_index);
+       error = ds_get_bts_index(child->bts, &bts_index);
        if (error < 0)
                return error;
 
@@ -684,7 +684,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
        if (bts_end <= bts_index)
                bts_index -= bts_end;
 
-       error = ds_access_bts(child, bts_index, &bts_record);
+       error = ds_access_bts(child->bts, bts_index, &bts_record);
        if (error < 0)
                return error;
 
@@ -705,14 +705,14 @@ static int ptrace_bts_drain(struct task_struct *child,
        size_t end, i;
        int error;
 
-       error = ds_get_bts_index(child, &end);
+       error = ds_get_bts_index(child->bts, &end);
        if (error < 0)
                return error;
 
        if (size < (end * sizeof(struct bts_struct)))
                return -EIO;
 
-       error = ds_access_bts(child, 0, (const void **)&raw);
+       error = ds_access_bts(child->bts, 0, (const void **)&raw);
        if (error < 0)
                return error;
 
@@ -723,18 +723,13 @@ static int ptrace_bts_drain(struct task_struct *child,
                        return -EFAULT;
        }
 
-       error = ds_clear_bts(child);
+       error = ds_clear_bts(child->bts);
        if (error < 0)
                return error;
 
        return end;
 }
 
-static void ptrace_bts_ovfl(struct task_struct *child)
-{
-       send_sig(child->thread.bts_ovfl_signal, child, 0);
-}
-
 static int ptrace_bts_config(struct task_struct *child,
                             long cfg_size,
                             const struct ptrace_bts_config __user *ucfg)
@@ -760,23 +755,45 @@ static int ptrace_bts_config(struct task_struct *child,
                goto errout;
 
        if (cfg.flags & PTRACE_BTS_O_ALLOC) {
-               ds_ovfl_callback_t ovfl = NULL;
+               bts_ovfl_callback_t ovfl = NULL;
                unsigned int sig = 0;
 
-               /* we ignore the error in case we were not tracing child */
-               (void)ds_release_bts(child);
+               error = -EINVAL;
+               if (cfg.size < (10 * bts_cfg.sizeof_bts))
+                       goto errout;
 
                if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
                        if (!cfg.signal)
                                goto errout;
 
+                       error = -EOPNOTSUPP;
+                       goto errout;
+
                        sig  = cfg.signal;
-                       ovfl = ptrace_bts_ovfl;
                }
 
-               error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
-               if (error < 0)
+               if (child->bts) {
+                       (void)ds_release_bts(child->bts);
+                       kfree(child->bts_buffer);
+
+                       child->bts = NULL;
+                       child->bts_buffer = NULL;
+               }
+
+               error = -ENOMEM;
+               child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL);
+               if (!child->bts_buffer)
+                       goto errout;
+
+               child->bts = ds_request_bts(child, child->bts_buffer, cfg.size,
+                                           ovfl, /* th = */ (size_t)-1);
+               if (IS_ERR(child->bts)) {
+                       error = PTR_ERR(child->bts);
+                       kfree(child->bts_buffer);
+                       child->bts = NULL;
+                       child->bts_buffer = NULL;
                        goto errout;
+               }
 
                child->thread.bts_ovfl_signal = sig;
        }
@@ -823,15 +840,15 @@ static int ptrace_bts_status(struct task_struct *child,
        if (cfg_size < sizeof(cfg))
                return -EIO;
 
-       error = ds_get_bts_end(child, &end);
+       error = ds_get_bts_end(child->bts, &end);
        if (error < 0)
                return error;
 
-       error = ds_access_bts(child, /* index = */ 0, &base);
+       error = ds_access_bts(child->bts, /* index = */ 0, &base);
        if (error < 0)
                return error;
 
-       error = ds_access_bts(child, /* index = */ end, &max);
+       error = ds_access_bts(child->bts, /* index = */ end, &max);
        if (error < 0)
                return error;
 
@@ -884,10 +901,7 @@ static int ptrace_bts_write_record(struct task_struct *child,
                return -EINVAL;
        }
 
-       /* The writing task will be the switched-to task on a context
-        * switch. It needs to write into the switched-from task's BTS
-        * buffer. */
-       return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
+       return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts);
 }
 
 void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -929,17 +943,16 @@ void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
        switch (c->x86) {
        case 0x6:
                switch (c->x86_model) {
+               case 0 ... 0xC:
+                       /* sorry, don't know about them */
+                       break;
                case 0xD:
                case 0xE: /* Pentium M */
                        bts_configure(&bts_cfg_pentium_m);
                        break;
-               case 0xF: /* Core2 */
-        case 0x1C: /* Atom */
+               default: /* Core2, Atom, ... */
                        bts_configure(&bts_cfg_core2);
                        break;
-               default:
-                       /* sorry, don't know about them */
-                       break;
                }
                break;
        case 0xF:
@@ -973,13 +986,17 @@ void ptrace_disable(struct task_struct *child)
        clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
 #ifdef CONFIG_X86_PTRACE_BTS
-       (void)ds_release_bts(child);
+       if (child->bts) {
+               (void)ds_release_bts(child->bts);
+               kfree(child->bts_buffer);
+               child->bts_buffer = NULL;
 
-       child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
-       if (!child->thread.debugctlmsr)
-               clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+               child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+               if (!child->thread.debugctlmsr)
+                       clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
 
-       clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+               clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+       }
 #endif /* CONFIG_X86_PTRACE_BTS */
 }
 
@@ -1111,9 +1128,16 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
                        (child, data, (struct ptrace_bts_config __user *)addr);
                break;
 
-       case PTRACE_BTS_SIZE:
-               ret = ds_get_bts_index(child, /* pos = */ NULL);
+       case PTRACE_BTS_SIZE: {
+               size_t size;
+
+               ret = ds_get_bts_index(child->bts, &size);
+               if (ret == 0) {
+                       BUG_ON(size != (int) size);
+                       ret = (int) size;
+               }
                break;
+       }
 
        case PTRACE_BTS_GET:
                ret = ptrace_bts_read_record
@@ -1121,7 +1145,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
                break;
 
        case PTRACE_BTS_CLEAR:
-               ret = ds_clear_bts(child);
+               ret = ds_clear_bts(child->bts);
                break;
 
        case PTRACE_BTS_DRAIN:
index a03e7f6..10786af 100644
@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/stacktrace.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 #include <asm/stacktrace.h>
 
 static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
                trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
+
+struct stack_frame {
+       const void __user       *next_fp;
+       unsigned long           ret_addr;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+       int ret;
+
+       if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+               return 0;
+
+       ret = 1;
+       pagefault_disable();
+       if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+               ret = 0;
+       pagefault_enable();
+
+       return ret;
+}
+
+static inline void __save_stack_trace_user(struct stack_trace *trace)
+{
+       const struct pt_regs *regs = task_pt_regs(current);
+       const void __user *fp = (const void __user *)regs->bp;
+
+       if (trace->nr_entries < trace->max_entries)
+               trace->entries[trace->nr_entries++] = regs->ip;
+
+       while (trace->nr_entries < trace->max_entries) {
+               struct stack_frame frame;
+
+               frame.next_fp = NULL;
+               frame.ret_addr = 0;
+               if (!copy_stack_frame(fp, &frame))
+                       break;
+               if ((unsigned long)fp < regs->sp)
+                       break;
+               if (frame.ret_addr) {
+                       trace->entries[trace->nr_entries++] =
+                               frame.ret_addr;
+               }
+               if (fp == frame.next_fp)
+                       break;
+               fp = frame.next_fp;
+       }
+}
+
+void save_stack_trace_user(struct stack_trace *trace)
+{
+       /*
+        * Trace user stack if we are not a kernel thread
+        */
+       if (current->mm) {
+               __save_stack_trace_user(trace);
+       }
+       if (trace->nr_entries < trace->max_entries)
+               trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
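A minimal usage sketch for the new helper, assuming the usual struct stack_trace layout from <linux/stacktrace.h>; it must run in the context of the task whose user stack is wanted:

        unsigned long entries[16];
        struct stack_trace trace = {
                .entries        = entries,
                .max_entries    = ARRAY_SIZE(entries),
        };

        save_stack_trace_user(&trace);
        /* entries[0 .. trace.nr_entries - 1] now holds user return addresses,
         * terminated by ULONG_MAX when there was room left */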
index 0b8b669..6f3d3d4 100644
@@ -17,6 +17,9 @@
  *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
  */
 
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
index 9e68075..4a20b2f 100644
@@ -39,7 +39,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
 #define __do_strncpy_from_user(dst, src, count, res)                      \
 do {                                                                      \
        int __d0, __d1, __d2;                                              \
-       might_sleep();                                                     \
+       might_fault();                                                     \
        __asm__ __volatile__(                                              \
                "       testl %1,%1\n"                                     \
                "       jz 2f\n"                                           \
@@ -126,7 +126,7 @@ EXPORT_SYMBOL(strncpy_from_user);
 #define __do_clear_user(addr,size)                                     \
 do {                                                                   \
        int __d0;                                                       \
-       might_sleep();                                                  \
+       might_fault();                                                  \
        __asm__ __volatile__(                                           \
                "0:     rep; stosl\n"                                   \
                "       movl %2,%0\n"                                   \
@@ -155,7 +155,7 @@ do {                                                                        \
 unsigned long
 clear_user(void __user *to, unsigned long n)
 {
-       might_sleep();
+       might_fault();
        if (access_ok(VERIFY_WRITE, to, n))
                __do_clear_user(to, n);
        return n;
@@ -197,7 +197,7 @@ long strnlen_user(const char __user *s, long n)
        unsigned long mask = -__addr_ok(s);
        unsigned long res, tmp;
 
-       might_sleep();
+       might_fault();
 
        __asm__ __volatile__(
                "       testl %0, %0\n"
index f4df6e7..64d6c84 100644
@@ -15,7 +15,7 @@
 #define __do_strncpy_from_user(dst,src,count,res)                         \
 do {                                                                      \
        long __d0, __d1, __d2;                                             \
-       might_sleep();                                                     \
+       might_fault();                                                     \
        __asm__ __volatile__(                                              \
                "       testq %1,%1\n"                                     \
                "       jz 2f\n"                                           \
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user);
 unsigned long __clear_user(void __user *addr, unsigned long size)
 {
        long __d0;
-       might_sleep();
+       might_fault();
        /* no memory constraint because it doesn't change any memory gcc knows
           about */
        asm volatile(
index fea4565..d8cc96a 100644
@@ -8,9 +8,8 @@ obj-$(CONFIG_X86_PTDUMP)        += dump_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)          += highmem_32.o
 
-obj-$(CONFIG_MMIOTRACE_HOOKS)  += kmmio.o
 obj-$(CONFIG_MMIOTRACE)                += mmiotrace.o
-mmiotrace-y                    := pf_in.o mmio-mod.o
+mmiotrace-y                    := kmmio.o pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)   += testmmiotrace.o
 
 obj-$(CONFIG_NUMA)             += numa_$(BITS).o
index 31e8730..21e996a 100644
@@ -53,7 +53,7 @@
 
 static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
 {
-#ifdef CONFIG_MMIOTRACE_HOOKS
+#ifdef CONFIG_MMIOTRACE
        if (unlikely(is_kmmio_active()))
                if (kmmio_handler(regs, addr) == 1)
                        return -1;
@@ -413,6 +413,7 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
                                 unsigned long error_code)
 {
        unsigned long flags = oops_begin();
+       int sig = SIGKILL;
        struct task_struct *tsk;
 
        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
@@ -423,8 +424,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        if (__die("Bad pagetable", regs, error_code))
-               regs = NULL;
-       oops_end(flags, regs, SIGKILL);
+               sig = 0;
+       oops_end(flags, regs, sig);
 }
 #endif
 
@@ -590,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
        int fault;
 #ifdef CONFIG_X86_64
        unsigned long flags;
+       int sig;
 #endif
 
        tsk = current;
@@ -849,11 +851,12 @@ no_context:
        bust_spinlocks(0);
        do_exit(SIGKILL);
 #else
+       sig = SIGKILL;
        if (__die("Oops", regs, error_code))
-               regs = NULL;
+               sig = 0;
        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_EMERG "CR2: %016lx\n", address);
-       oops_end(flags, regs, SIGKILL);
+       oops_end(flags, regs, sig);
 #endif
 
 /*
index 1ef0f90..d9d3582 100644
@@ -9,6 +9,9 @@
  * Also alternative() doesn't work.
  */
 
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
 #include <linux/kernel.h>
 #include <linux/posix-timers.h>
 #include <linux/time.h>
index 1ab7c15..290b219 100644
@@ -47,6 +47,7 @@ config BLK_DEV_IO_TRACE
        depends on SYSFS
        select RELAY
        select DEBUG_FS
+       select TRACEPOINTS
        help
          Say Y here if you want to be able to trace the block layer actions
          on a given queue. Tracing allows you to see any traffic happening
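
Selecting TRACEPOINTS here backs the conversion that follows in blk-core.c, where the blk_add_trace_* helpers are replaced by trace_block_*() tracepoints. A consumer such as blktrace then attaches probes at runtime; a sketch, assuming the register_trace_<name>() helpers that DECLARE_TRACE() generates for <trace/block.h>:

        static void probe_block_plug(struct request_queue *q)
        {
                /* equivalent of the old BLK_TA_PLUG event: note that q was plugged */
        }

        /* attach and detach from the consumer's setup and teardown paths:
         *      register_trace_block_plug(probe_block_plug);
         *      unregister_trace_block_plug(probe_block_plug);
         */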
index c36aa98..561e8a1 100644
 #include <linux/task_io_accounting_ops.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
+#include <trace/block.h>
 
 #include "blk.h"
 
+DEFINE_TRACE(block_plug);
+DEFINE_TRACE(block_unplug_io);
+DEFINE_TRACE(block_unplug_timer);
+DEFINE_TRACE(block_getrq);
+DEFINE_TRACE(block_sleeprq);
+DEFINE_TRACE(block_rq_requeue);
+DEFINE_TRACE(block_bio_backmerge);
+DEFINE_TRACE(block_bio_frontmerge);
+DEFINE_TRACE(block_bio_queue);
+DEFINE_TRACE(block_rq_complete);
+DEFINE_TRACE(block_remap);     /* Also used in drivers/md/dm.c */
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+
 static int __make_request(struct request_queue *q, struct bio *bio);
 
 /*
@@ -205,7 +219,7 @@ void blk_plug_device(struct request_queue *q)
 
        if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
                mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-               blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+               trace_block_plug(q);
        }
 }
 EXPORT_SYMBOL(blk_plug_device);
@@ -292,9 +306,7 @@ void blk_unplug_work(struct work_struct *work)
        struct request_queue *q =
                container_of(work, struct request_queue, unplug_work);
 
-       blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
-                               q->rq.count[READ] + q->rq.count[WRITE]);
-
+       trace_block_unplug_io(q);
        q->unplug_fn(q);
 }
 
@@ -302,9 +314,7 @@ void blk_unplug_timeout(unsigned long data)
 {
        struct request_queue *q = (struct request_queue *)data;
 
-       blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
-                               q->rq.count[READ] + q->rq.count[WRITE]);
-
+       trace_block_unplug_timer(q);
        kblockd_schedule_work(q, &q->unplug_work);
 }
 
@@ -314,9 +324,7 @@ void blk_unplug(struct request_queue *q)
         * devices don't necessarily have an ->unplug_fn defined
         */
        if (q->unplug_fn) {
-               blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
-                                       q->rq.count[READ] + q->rq.count[WRITE]);
-
+               trace_block_unplug_io(q);
                q->unplug_fn(q);
        }
 }
@@ -822,7 +830,7 @@ rq_starved:
        if (ioc_batching(q, ioc))
                ioc->nr_batch_requests--;
 
-       blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+       trace_block_getrq(q, bio, rw);
 out:
        return rq;
 }
@@ -848,7 +856,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
                prepare_to_wait_exclusive(&rl->wait[rw], &wait,
                                TASK_UNINTERRUPTIBLE);
 
-               blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+               trace_block_sleeprq(q, bio, rw);
 
                __generic_unplug_device(q);
                spin_unlock_irq(q->queue_lock);
@@ -928,7 +936,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
        blk_delete_timer(rq);
        blk_clear_rq_complete(rq);
-       blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+       trace_block_rq_requeue(q, rq);
 
        if (blk_rq_tagged(rq))
                blk_queue_end_tag(q, rq);
@@ -1167,7 +1175,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
                if (!ll_back_merge_fn(q, req, bio))
                        break;
 
-               blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+               trace_block_bio_backmerge(q, bio);
 
                req->biotail->bi_next = bio;
                req->biotail = bio;
@@ -1186,7 +1194,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
                if (!ll_front_merge_fn(q, req, bio))
                        break;
 
-               blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+               trace_block_bio_frontmerge(q, bio);
 
                bio->bi_next = req->bio;
                req->bio = bio;
@@ -1269,7 +1277,7 @@ static inline void blk_partition_remap(struct bio *bio)
                bio->bi_sector += p->start_sect;
                bio->bi_bdev = bdev->bd_contains;
 
-               blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
+               trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
                                    bdev->bd_dev, bio->bi_sector,
                                    bio->bi_sector - p->start_sect);
        }
@@ -1441,10 +1449,10 @@ end_io:
                        goto end_io;
 
                if (old_sector != -1)
-                       blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
+                       trace_block_remap(q, bio, old_dev, bio->bi_sector,
                                            old_sector);
 
-               blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+               trace_block_bio_queue(q, bio);
 
                old_sector = bio->bi_sector;
                old_dev = bio->bi_bdev->bd_dev;
@@ -1678,7 +1686,7 @@ static int __end_that_request_first(struct request *req, int error,
        int total_bytes, bio_nbytes, next_idx = 0;
        struct bio *bio;
 
-       blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+       trace_block_rq_complete(req->q, req);
 
        /*
         * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
index 85049a7..b0a2cae 100644 (file)
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
+#include <trace/block.h>
 #include <asm/uaccess.h>
 
 static unsigned int blktrace_seq __read_mostly = 1;
 
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+
+static int blk_register_tracepoints(void);
+static void blk_unregister_tracepoints(void);
+
 /*
  * Send out a notify message.
  */
@@ -119,7 +127,7 @@ static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK
  * The worker for the various blk_add_trace*() types. Fills out a
  * blk_io_trace structure and places it in a per-cpu subbuffer.
  */
-void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
                     int rw, u32 what, int error, int pdu_len, void *pdu_data)
 {
        struct task_struct *tsk = current;
@@ -177,8 +185,6 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
        local_irq_restore(flags);
 }
 
-EXPORT_SYMBOL_GPL(__blk_add_trace);
-
 static struct dentry *blk_tree_root;
 static DEFINE_MUTEX(blk_tree_mutex);
 static unsigned int root_users;
@@ -237,6 +243,10 @@ static void blk_trace_cleanup(struct blk_trace *bt)
        free_percpu(bt->sequence);
        free_percpu(bt->msg_data);
        kfree(bt);
+       mutex_lock(&blk_probe_mutex);
+       if (atomic_dec_and_test(&blk_probes_ref))
+               blk_unregister_tracepoints();
+       mutex_unlock(&blk_probe_mutex);
 }
 
 int blk_trace_remove(struct request_queue *q)
@@ -428,6 +438,14 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
        bt->pid = buts->pid;
        bt->trace_state = Blktrace_setup;
 
+       mutex_lock(&blk_probe_mutex);
+       if (atomic_add_return(1, &blk_probes_ref) == 1) {
+               ret = blk_register_tracepoints();
+               if (ret)
+                       goto probe_err;
+       }
+       mutex_unlock(&blk_probe_mutex);
+
        ret = -EBUSY;
        old_bt = xchg(&q->blk_trace, bt);
        if (old_bt) {
@@ -436,6 +454,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
        }
 
        return 0;
+probe_err:
+       atomic_dec(&blk_probes_ref);
+       mutex_unlock(&blk_probe_mutex);
 err:
        if (dir)
                blk_remove_tree(dir);
@@ -562,3 +583,308 @@ void blk_trace_shutdown(struct request_queue *q)
                blk_trace_remove(q);
        }
 }
+
+/*
+ * blktrace probes
+ */
+
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * @q:         queue the io is for
+ * @rq:                the source request
+ * @what:      the action
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+                                   u32 what)
+{
+       struct blk_trace *bt = q->blk_trace;
+       int rw = rq->cmd_flags & 0x03;
+
+       if (likely(!bt))
+               return;
+
+       if (blk_discard_rq(rq))
+               rw |= (1 << BIO_RW_DISCARD);
+
+       if (blk_pc_request(rq)) {
+               what |= BLK_TC_ACT(BLK_TC_PC);
+               __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
+                               sizeof(rq->cmd), rq->cmd);
+       } else  {
+               what |= BLK_TC_ACT(BLK_TC_FS);
+               __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+                               rw, what, rq->errors, 0, NULL);
+       }
+}
+
+static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
+{
+       blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+}
+
+static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
+{
+       blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+}
+
+static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
+{
+       blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+}
+
+static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq)
+{
+       blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+}
+
+static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq)
+{
+       blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * @q:         queue the io is for
+ * @bio:       the source bio
+ * @what:      the action
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+                                    u32 what)
+{
+       struct blk_trace *bt = q->blk_trace;
+
+       if (likely(!bt))
+               return;
+
+       __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+                       !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
+{
+       blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
+}
+
+static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
+{
+       blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
+}
+
+static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio)
+{
+       blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+}
+
+static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio)
+{
+       blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+}
+
+static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
+{
+       blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+}
+
+static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw)
+{
+       if (bio)
+               blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
+       else {
+               struct blk_trace *bt = q->blk_trace;
+
+               if (bt)
+                       __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+       }
+}
+
+
+static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw)
+{
+       if (bio)
+               blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
+       else {
+               struct blk_trace *bt = q->blk_trace;
+
+               if (bt)
+                       __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL);
+       }
+}
+
+static void blk_add_trace_plug(struct request_queue *q)
+{
+       struct blk_trace *bt = q->blk_trace;
+
+       if (bt)
+               __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+}
+
+static void blk_add_trace_unplug_io(struct request_queue *q)
+{
+       struct blk_trace *bt = q->blk_trace;
+
+       if (bt) {
+               unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+               __be64 rpdu = cpu_to_be64(pdu);
+
+               __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
+                               sizeof(rpdu), &rpdu);
+       }
+}
+
+static void blk_add_trace_unplug_timer(struct request_queue *q)
+{
+       struct blk_trace *bt = q->blk_trace;
+
+       if (bt) {
+               unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+               __be64 rpdu = cpu_to_be64(pdu);
+
+               __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
+                               sizeof(rpdu), &rpdu);
+       }
+}
+
+static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
+                               unsigned int pdu)
+{
+       struct blk_trace *bt = q->blk_trace;
+
+       if (bt) {
+               __be64 rpdu = cpu_to_be64(pdu);
+
+               __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+                               BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
+                               sizeof(rpdu), &rpdu);
+       }
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * @q:         queue the io is for
+ * @bio:       the source bio
+ * @dev:       target device
+ * @from:      source sector
+ * @to:                target sector
+ *
+ * Description:
+ *     A device mapper or raid target sometimes needs to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
+                                      dev_t dev, sector_t from, sector_t to)
+{
+       struct blk_trace *bt = q->blk_trace;
+       struct blk_io_trace_remap r;
+
+       if (likely(!bt))
+               return;
+
+       r.device = cpu_to_be32(dev);
+       r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
+       r.sector = cpu_to_be64(to);
+
+       __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
+                       !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+/**
+ * blk_add_driver_data - Add binary message with driver-specific data
+ * @q:         queue the io is for
+ * @rq:                io request
+ * @data:      driver-specific data
+ * @len:       length of driver-specific data
+ *
+ * Description:
+ *     Some drivers might want to write driver-specific data per request.
+ *
+ **/
+void blk_add_driver_data(struct request_queue *q,
+                        struct request *rq,
+                        void *data, size_t len)
+{
+       struct blk_trace *bt = q->blk_trace;
+
+       if (likely(!bt))
+               return;
+
+       if (blk_pc_request(rq))
+               __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
+                               rq->errors, len, data);
+       else
+               __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+                               0, BLK_TA_DRV_DATA, rq->errors, len, data);
+}
+EXPORT_SYMBOL_GPL(blk_add_driver_data);
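
blk_add_driver_data() is now a real exported function rather than a header inline, so block drivers can attach opaque per-request payloads to the trace stream. A hypothetical caller is sketched below; the structure and field names are made up for illustration, only the blk_add_driver_data() call itself matches the export above:

	#include <linux/types.h>
	#include <linux/blkdev.h>
	#include <linux/blktrace_api.h>

	struct mydrv_completion_info {		/* hypothetical driver-private data */
		u32 hw_queue;
		u32 firmware_status;
	};

	static void mydrv_note_completion(struct request_queue *q, struct request *rq,
					  u32 hw_queue, u32 fw_status)
	{
		struct mydrv_completion_info info = {
			.hw_queue	  = hw_queue,
			.firmware_status  = fw_status,
		};

		/* Emits a BLK_TA_DRV_DATA record carrying the raw bytes of 'info'. */
		blk_add_driver_data(q, rq, &info, sizeof(info));
	}
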
+
+static int blk_register_tracepoints(void)
+{
+       int ret;
+
+       ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
+       WARN_ON(ret);
+       ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
+       WARN_ON(ret);
+       ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
+       WARN_ON(ret);
+       ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+       WARN_ON(ret);
+       ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
+       WARN_ON(ret);
+       ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+       WARN_ON(ret);
+       ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
+       WARN_ON(ret);
+       ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+       WARN_ON(ret);
+       ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+       WARN_ON(ret);
+       ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
+       WARN_ON(ret);
+       ret = register_trace_block_getrq(blk_add_trace_getrq);
+       WARN_ON(ret);
+       ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
+       WARN_ON(ret);
+       ret = register_trace_block_plug(blk_add_trace_plug);
+       WARN_ON(ret);
+       ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+       WARN_ON(ret);
+       ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
+       WARN_ON(ret);
+       ret = register_trace_block_split(blk_add_trace_split);
+       WARN_ON(ret);
+       ret = register_trace_block_remap(blk_add_trace_remap);
+       WARN_ON(ret);
+       return 0;
+}
+
+static void blk_unregister_tracepoints(void)
+{
+       unregister_trace_block_remap(blk_add_trace_remap);
+       unregister_trace_block_split(blk_add_trace_split);
+       unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
+       unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+       unregister_trace_block_plug(blk_add_trace_plug);
+       unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
+       unregister_trace_block_getrq(blk_add_trace_getrq);
+       unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
+       unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+       unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+       unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
+       unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+       unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
+       unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+       unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
+       unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
+       unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
+
+       tracepoint_synchronize_unregister();
+}
index a6951f7..86836dd 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
 
@@ -41,6 +42,8 @@
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
 
+DEFINE_TRACE(block_rq_abort);
+
 /*
  * Merge hash stuff.
  */
@@ -52,6 +55,9 @@ static const int elv_hash_shift = 6;
 #define rq_hash_key(rq)                ((rq)->sector + (rq)->nr_sectors)
 #define ELV_ON_HASH(rq)                (!hlist_unhashed(&(rq)->hash))
 
+DEFINE_TRACE(block_rq_insert);
+DEFINE_TRACE(block_rq_issue);
+
 /*
  * Query io scheduler to see if the current process issuing bio may be
  * merged with rq.
@@ -586,7 +592,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
        unsigned ordseq;
        int unplug_it = 1;
 
-       blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+       trace_block_rq_insert(q, rq);
 
        rq->q = q;
 
@@ -772,7 +778,7 @@ struct request *elv_next_request(struct request_queue *q)
                         * not be passed by new incoming requests
                         */
                        rq->cmd_flags |= REQ_STARTED;
-                       blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+                       trace_block_rq_issue(q, rq);
                }
 
                if (!q->boundary_rq || q->boundary_rq == rq) {
@@ -914,7 +920,7 @@ void elv_abort_queue(struct request_queue *q)
        while (!list_empty(&q->queue_head)) {
                rq = list_entry_rq(q->queue_head.next);
                rq->cmd_flags |= REQ_QUIET;
-               blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+               trace_block_rq_abort(q, rq);
                __blk_end_request(rq, -EIO, blk_rq_bytes(rq));
        }
 }
index ce0d9da..94966ed 100644 (file)
@@ -274,6 +274,22 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = {
        .enable_mask    = SYSRQ_ENABLE_DUMP,
 };
 
+#ifdef CONFIG_TRACING
+#include <linux/ftrace.h>
+
+static void sysrq_ftrace_dump(int key, struct tty_struct *tty)
+{
+       ftrace_dump();
+}
+static struct sysrq_key_op sysrq_ftrace_dump_op = {
+       .handler        = sysrq_ftrace_dump,
+       .help_msg       = "dumpZ-ftrace-buffer",
+       .action_msg     = "Dump ftrace buffer",
+       .enable_mask    = SYSRQ_ENABLE_DUMP,
+};
+#else
+#define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)0)
+#endif
 
 static void sysrq_handle_showmem(int key, struct tty_struct *tty)
 {
@@ -406,7 +422,7 @@ static struct sysrq_key_op *sysrq_key_table[36] = {
        NULL,                           /* x */
        /* y: May be registered on sparc64 for global register dump */
        NULL,                           /* y */
-       NULL                            /* z */
+       &sysrq_ftrace_dump_op,          /* z */
 };
 
 /* key2index calculation, -1 on invalid index */
index c99e472..343094c 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 
 #define DM_MSG_PREFIX "core"
 
@@ -51,6 +52,8 @@ struct dm_target_io {
        union map_info info;
 };
 
+DEFINE_TRACE(block_bio_complete);
+
 union map_info *dm_get_mapinfo(struct bio *bio)
 {
        if (bio && bio->bi_private)
@@ -504,8 +507,7 @@ static void dec_pending(struct dm_io *io, int error)
                end_io_acct(io);
 
                if (io->error != DM_ENDIO_REQUEUE) {
-                       blk_add_trace_bio(io->md->queue, io->bio,
-                                         BLK_TA_COMPLETE);
+                       trace_block_bio_complete(io->md->queue, io->bio);
 
                        bio_endio(io->bio, io->error);
                }
@@ -598,7 +600,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
        if (r == DM_MAPIO_REMAPPED) {
                /* the bio has been remapped so dispatch it */
 
-               blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
+               trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
                                    tio->io->bio->bi_bdev->bd_dev,
                                    clone->bi_sector, sector);
 
index 77a55bc..df99c88 100644 (file)
--- a/fs/bio.c
+++ b/fs/bio.c
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 #include <scsi/sg.h>           /* for struct sg_iovec */
 
+DEFINE_TRACE(block_split);
+
 static struct kmem_cache *bio_slab __read_mostly;
 
 static mempool_t *bio_split_pool __read_mostly;
@@ -1263,7 +1266,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
        if (!bp)
                return bp;
 
-       blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
+       trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
                                bi->bi_sector + first_sectors);
 
        BUG_ON(bi->bi_vcnt != 1);
index eba2eab..16c2115 100644 (file)
@@ -357,7 +357,18 @@ int seq_printf(struct seq_file *m, const char *f, ...)
 }
 EXPORT_SYMBOL(seq_printf);
 
-static char *mangle_path(char *s, char *p, char *esc)
+/**
+ *     mangle_path -   mangle and copy path to buffer beginning
+ *     @s: buffer start
+ *     @p: beginning of path in above buffer
+ *     @esc: set of characters that need escaping
+ *
+ *      Copy the path from @p to @s, replacing each occurrence of character from
+ *      @esc with usual octal escape.
+ *      Returns pointer past last written character in @s, or NULL in case of
+ *      failure.
+ */
+char *mangle_path(char *s, char *p, char *esc)
 {
        while (s <= p) {
                char c = *p++;
@@ -376,6 +387,7 @@ static char *mangle_path(char *s, char *p, char *esc)
        }
        return NULL;
 }
+EXPORT_SYMBOL(mangle_path);
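
mangle_path() is exported so code outside seq_file.c (the trace output paths in this series) can reuse the same octal escaping. A minimal sketch of the calling convention, modeled on seq_path() and with error handling trimmed: d_path() writes the path at the end of the buffer, then mangle_path() copies it to the front while escaping.

	#include <linux/seq_file.h>
	#include <linux/dcache.h>
	#include <linux/err.h>

	/* sketch of a caller; assumes the usual seq_file buffer layout */
	static int show_escaped_path(struct seq_file *m, struct path *path)
	{
		char *s = m->buf + m->count;			/* free space in the buffer */
		char *p = d_path(path, s, m->size - m->count);	/* path built at the buffer end */

		if (!IS_ERR(p)) {
			s = mangle_path(s, p, " \t\n\\");	/* escape blanks and backslashes */
			if (s) {
				m->count = s - m->buf;
				return 0;
			}
		}
		return -1;
	}
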
 
 /*
  * return the absolute path of 'dentry' residing in mount 'mnt'.
index 8074460..eba835a 100644 (file)
 #define MCOUNT_REC()
 #endif
 
+#ifdef CONFIG_TRACE_BRANCH_PROFILING
+#define LIKELY_PROFILE()       VMLINUX_SYMBOL(__start_annotated_branch_profile) = .; \
+                               *(_ftrace_annotated_branch)                           \
+                               VMLINUX_SYMBOL(__stop_annotated_branch_profile) = .;
+#else
+#define LIKELY_PROFILE()
+#endif
+
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+#define BRANCH_PROFILE()       VMLINUX_SYMBOL(__start_branch_profile) = .;   \
+                               *(_ftrace_branch)                             \
+                               VMLINUX_SYMBOL(__stop_branch_profile) = .;
+#else
+#define BRANCH_PROFILE()
+#endif
+
 /* .data section */
 #define DATA_DATA                                                      \
        *(.data)                                                        \
        VMLINUX_SYMBOL(__start___markers) = .;                          \
        *(__markers)                                                    \
        VMLINUX_SYMBOL(__stop___markers) = .;                           \
+       . = ALIGN(32);                                                  \
        VMLINUX_SYMBOL(__start___tracepoints) = .;                      \
        *(__tracepoints)                                                \
-       VMLINUX_SYMBOL(__stop___tracepoints) = .;
+       VMLINUX_SYMBOL(__stop___tracepoints) = .;                       \
+       LIKELY_PROFILE()                                                \
+       BRANCH_PROFILE()
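
LIKELY_PROFILE() and BRANCH_PROFILE() reserve contiguous arrays of struct ftrace_branch_data (the structure is added to linux/compiler.h later in this diff) between start/stop linker symbols, so the tracer can walk them like an ordinary array. A sketch of such a walk follows; the real reporting side lives in kernel/trace/trace_branch.c, and the printk form here is illustrative only:

	#include <linux/kernel.h>
	#include <linux/compiler.h>

	extern struct ftrace_branch_data __start_annotated_branch_profile[];
	extern struct ftrace_branch_data __stop_annotated_branch_profile[];

	static void dump_annotated_branch_stats(void)
	{
		struct ftrace_branch_data *p;

		for (p = __start_annotated_branch_profile;
		     p < __stop_annotated_branch_profile; p++)
			printk(KERN_INFO "%s:%u %s(): correct=%lu incorrect=%lu\n",
			       p->file, p->line, p->func, p->correct, p->incorrect);
	}
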
 
 #define RO_DATA(align)                                                 \
        . = ALIGN((align));                                             \
index bdf505d..1dba349 100644 (file)
@@ -160,7 +160,6 @@ struct blk_trace {
 
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
-extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
 extern int do_blk_trace_setup(struct request_queue *q,
        char *name, dev_t dev, struct blk_user_trace_setup *buts);
 extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
@@ -186,168 +185,8 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
        } while (0)
 #define BLK_TN_MAX_MSG         128
 
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:         queue the io is for
- * @rq:                the source request
- * @what:      the action
- *
- * Description:
- *     Records an action against a request. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
-                                   u32 what)
-{
-       struct blk_trace *bt = q->blk_trace;
-       int rw = rq->cmd_flags & 0x03;
-
-       if (likely(!bt))
-               return;
-
-       if (blk_discard_rq(rq))
-               rw |= (1 << BIO_RW_DISCARD);
-
-       if (blk_pc_request(rq)) {
-               what |= BLK_TC_ACT(BLK_TC_PC);
-               __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
-       } else  {
-               what |= BLK_TC_ACT(BLK_TC_FS);
-               __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
-       }
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q:         queue the io is for
- * @bio:       the source bio
- * @what:      the action
- *
- * Description:
- *     Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-                                    u32 what)
-{
-       struct blk_trace *bt = q->blk_trace;
-
-       if (likely(!bt))
-               return;
-
-       __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-/**
- * blk_add_trace_generic - Add a trace for a generic action
- * @q:         queue the io is for
- * @bio:       the source bio
- * @rw:                the data direction
- * @what:      the action
- *
- * Description:
- *     Records a simple trace
- *
- **/
-static inline void blk_add_trace_generic(struct request_queue *q,
-                                        struct bio *bio, int rw, u32 what)
-{
-       struct blk_trace *bt = q->blk_trace;
-
-       if (likely(!bt))
-               return;
-
-       if (bio)
-               blk_add_trace_bio(q, bio, what);
-       else
-               __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
-}
-
-/**
- * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
- * @q:         queue the io is for
- * @what:      the action
- * @bio:       the source bio
- * @pdu:       the integer payload
- *
- * Description:
- *     Adds a trace with some integer payload. This might be an unplug
- *     option given as the action, with the depth at unplug time given
- *     as the payload
- *
- **/
-static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
-                                        struct bio *bio, unsigned int pdu)
-{
-       struct blk_trace *bt = q->blk_trace;
-       __be64 rpdu = cpu_to_be64(pdu);
-
-       if (likely(!bt))
-               return;
-
-       if (bio)
-               __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
-       else
-               __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q:         queue the io is for
- * @bio:       the source bio
- * @dev:       target device
- * @from:      source sector
- * @to:                target sector
- *
- * Description:
- *     Device mapper or raid target sometimes need to split a bio because
- *     it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-                                      dev_t dev, sector_t from, sector_t to)
-{
-       struct blk_trace *bt = q->blk_trace;
-       struct blk_io_trace_remap r;
-
-       if (likely(!bt))
-               return;
-
-       r.device = cpu_to_be32(dev);
-       r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-       r.sector = cpu_to_be64(to);
-
-       __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
-
-/**
- * blk_add_driver_data - Add binary message with driver-specific data
- * @q:         queue the io is for
- * @rq:                io request
- * @data:      driver-specific data
- * @len:       length of driver-specific data
- *
- * Description:
- *     Some drivers might want to write driver-specific data per request.
- *
- **/
-static inline void blk_add_driver_data(struct request_queue *q,
-                                      struct request *rq,
-                                      void *data, size_t len)
-{
-       struct blk_trace *bt = q->blk_trace;
-
-       if (likely(!bt))
-               return;
-
-       if (blk_pc_request(rq))
-               __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
-                               rq->errors, len, data);
-       else
-               __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
-                               0, BLK_TA_DRV_DATA, rq->errors, len, data);
-}
-
+extern void blk_add_driver_data(struct request_queue *q, struct request *rq,
+                               void *data, size_t len);
 extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
                           char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
@@ -356,13 +195,8 @@ extern int blk_trace_remove(struct request_queue *q);
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 #define blk_trace_ioctl(bdev, cmd, arg)                (-ENOTTY)
 #define blk_trace_shutdown(q)                  do { } while (0)
-#define blk_add_trace_rq(q, rq, what)          do { } while (0)
-#define blk_add_trace_bio(q, rq, what)         do { } while (0)
-#define blk_add_trace_generic(q, rq, rw, what) do { } while (0)
-#define blk_add_trace_pdu_int(q, what, bio, pdu)       do { } while (0)
-#define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0)
-#define blk_add_driver_data(q, rq, data, len)  do {} while (0)
 #define do_blk_trace_setup(q, name, dev, buts) (-ENOTTY)
+#define blk_add_driver_data(q, rq, data, len)  do {} while (0)
 #define blk_trace_setup(q, name, dev, arg)     (-ENOTTY)
 #define blk_trace_startstop(q, start)          (-ENOTTY)
 #define blk_trace_remove(q)                    (-ENOTTY)
index 98115d9..ea7c6be 100644 (file)
@@ -59,8 +59,88 @@ extern void __chk_io_ptr(const volatile void __iomem *);
  * specific implementations come from the above header files
  */
 
-#define likely(x)      __builtin_expect(!!(x), 1)
-#define unlikely(x)    __builtin_expect(!!(x), 0)
+struct ftrace_branch_data {
+       const char *func;
+       const char *file;
+       unsigned line;
+       union {
+               struct {
+                       unsigned long correct;
+                       unsigned long incorrect;
+               };
+               struct {
+                       unsigned long miss;
+                       unsigned long hit;
+               };
+       };
+};
+
+/*
+ * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code
+ * to disable branch tracing on a per file basis.
+ */
+#if defined(CONFIG_TRACE_BRANCH_PROFILING) && !defined(DISABLE_BRANCH_PROFILING)
+void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
+
+#define likely_notrace(x)      __builtin_expect(!!(x), 1)
+#define unlikely_notrace(x)    __builtin_expect(!!(x), 0)
+
+#define __branch_check__(x, expect) ({                                 \
+                       int ______r;                                    \
+                       static struct ftrace_branch_data                \
+                               __attribute__((__aligned__(4)))         \
+                               __attribute__((section("_ftrace_annotated_branch"))) \
+                               ______f = {                             \
+                               .func = __func__,                       \
+                               .file = __FILE__,                       \
+                               .line = __LINE__,                       \
+                       };                                              \
+                       ______r = likely_notrace(x);                    \
+                       ftrace_likely_update(&______f, ______r, expect); \
+                       ______r;                                        \
+               })
+
+/*
+ * Using __builtin_constant_p(x) to ignore cases where the return
+ * value is always the same.  This idea is taken from a similar patch
+ * written by Daniel Walker.
+ */
+# ifndef likely
+#  define likely(x)    (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
+# endif
+# ifndef unlikely
+#  define unlikely(x)  (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))
+# endif
+
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+/*
+ * "Define 'is'", Bill Clinton
+ * "Define 'if'", Steven Rostedt
+ */
+#define if(cond) if (__builtin_constant_p((cond)) ? !!(cond) :         \
+       ({                                                              \
+               int ______r;                                            \
+               static struct ftrace_branch_data                        \
+                       __attribute__((__aligned__(4)))                 \
+                       __attribute__((section("_ftrace_branch")))      \
+                       ______f = {                                     \
+                               .func = __func__,                       \
+                               .file = __FILE__,                       \
+                               .line = __LINE__,                       \
+                       };                                              \
+               ______r = !!(cond);                                     \
+               if (______r)                                            \
+                       ______f.hit++;                                  \
+               else                                                    \
+                       ______f.miss++;                                 \
+               ______r;                                                \
+       }))
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+
+#else
+# define likely(x)     __builtin_expect(!!(x), 1)
+# define unlikely(x)   __builtin_expect(!!(x), 0)
+#endif
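
With branch profiling enabled, every likely()/unlikely() in non-exempt code funnels through ftrace_likely_update() together with the actual outcome and the annotated expectation. A minimal implementation consistent with the counters in struct ftrace_branch_data would look roughly like this; the real one lives in kernel/trace/trace_branch.c and may differ in detail:

	#include <linux/compiler.h>

	void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
	{
		/*
		 * Deliberately not atomic: a slightly racy counter is fine
		 * for profiling and keeps the hot path cheap.
		 */
		if (val == expect)
			f->correct++;
		else
			f->incorrect++;
	}
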
 
 /* Optimization barrier */
 #ifndef barrier
index 4aaa4af..096476f 100644 (file)
@@ -17,7 +17,7 @@ extern int debug_locks_off(void);
 ({                                                                     \
        int __ret = 0;                                                  \
                                                                        \
-       if (unlikely(c)) {                                              \
+       if (!oops_in_progress && unlikely(c)) {                         \
                if (debug_locks_off() && !debug_locks_silent)           \
                        WARN_ON(1);                                     \
                __ret = 1;                                              \
index 9c5bc6b..985b28d 100644 (file)
@@ -8,6 +8,8 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
+#include <linux/bitops.h>
+#include <linux/sched.h>
 
 #ifdef CONFIG_FUNCTION_TRACER
 
@@ -24,6 +26,45 @@ struct ftrace_ops {
        struct ftrace_ops *next;
 };
 
+extern int function_trace_stop;
+
+/*
+ * Type of the current tracing.
+ */
+enum ftrace_tracing_type_t {
+       FTRACE_TYPE_ENTER = 0, /* Hook the call of the function */
+       FTRACE_TYPE_RETURN,     /* Hook the return of the function */
+};
+
+/* Current tracing type, default is FTRACE_TYPE_ENTER */
+extern enum ftrace_tracing_type_t ftrace_tracing_type;
+
+/**
+ * ftrace_stop - stop function tracer.
+ *
+ * A quick way to stop the function tracer. Note this is an on/off switch,
+ * it is not something that is recursive like preempt_disable.
+ * This does not disable the calling of mcount, it only stops the
+ * calling of functions from mcount.
+ */
+static inline void ftrace_stop(void)
+{
+       function_trace_stop = 1;
+}
+
+/**
+ * ftrace_start - start the function tracer.
+ *
+ * This function is the inverse of ftrace_stop. This does not enable
+ * the function tracing if the function tracer is disabled. This only
+ * sets the function tracer flag to continue calling the functions
+ * from mcount.
+ */
+static inline void ftrace_start(void)
+{
+       function_trace_stop = 0;
+}
+
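
ftrace_stop()/ftrace_start() are intended for paths where calling into the tracer would be unsafe, and they are plain flag writes rather than a reference-counted pair. A hedged usage sketch (the surrounding function is hypothetical):

	#include <linux/ftrace.h>

	static void fragile_low_level_path(void)	/* hypothetical caller */
	{
		ftrace_stop();	/* mcount is still called, but it stops calling traced functions */

		/* ... code that must not recurse into the tracer ... */

		ftrace_start();	/* plain flag write: no nesting, last writer wins */
	}
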
 /*
  * The ftrace_ops must be a static and should also
  * be read_mostly.  These functions do modify read_mostly variables
@@ -42,9 +83,13 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1);
 # define unregister_ftrace_function(ops) do { } while (0)
 # define clear_ftrace_function(ops) do { } while (0)
 static inline void ftrace_kill(void) { }
+static inline void ftrace_stop(void) { }
+static inline void ftrace_start(void) { }
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
+/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */
+#include <asm/ftrace.h>
 
 enum {
        FTRACE_FL_FREE          = (1 << 0),
@@ -60,6 +105,7 @@ struct dyn_ftrace {
        struct list_head        list;
        unsigned long           ip; /* address of mcount call-site */
        unsigned long           flags;
+       struct dyn_arch_ftrace  arch;
 };
 
 int ftrace_force_update(void);
@@ -67,19 +113,25 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset);
 
 /* defined in arch */
 extern int ftrace_ip_converted(unsigned long ip);
-extern unsigned char *ftrace_nop_replace(void);
-extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
 extern int ftrace_dyn_arch_init(void *data);
 extern int ftrace_update_ftrace_func(ftrace_func_t func);
 extern void ftrace_caller(void);
 extern void ftrace_call(void);
 extern void mcount_call(void);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+extern void ftrace_graph_caller(void);
+extern int ftrace_enable_ftrace_graph_caller(void);
+extern int ftrace_disable_ftrace_graph_caller(void);
+#else
+static inline int ftrace_enable_ftrace_graph_caller(void) { return 0; }
+static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; }
+#endif
 
 /**
- * ftrace_modify_code - modify code segment
- * @ip: the address of the code segment
- * @old_code: the contents of what is expected to be there
- * @new_code: the code to patch in
+ * ftrace_make_nop - convert code into nop
+ * @mod: module structure if called by module load initialization
+ * @rec: the mcount call site record
+ * @addr: the address that the call site should be calling
  *
  * This is a very sensitive operation and great care needs
  * to be taken by the arch.  The operation should carefully
@@ -87,6 +139,8 @@ extern void mcount_call(void);
  * what we expect it to be, and then on success of the compare,
  * it should write to the location.
  *
+ * The code segment at @rec->ip should be a caller to @addr
+ *
  * Return must be:
  *  0 on success
  *  -EFAULT on error reading the location
@@ -94,8 +148,34 @@ extern void mcount_call(void);
  *  -EPERM  on error writing to the location
  * Any other value will be considered a failure.
  */
-extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code,
-                             unsigned char *new_code);
+extern int ftrace_make_nop(struct module *mod,
+                          struct dyn_ftrace *rec, unsigned long addr);
+
+/**
+ * ftrace_make_call - convert a nop call site into a call to addr
+ * @rec: the mcount call site record
+ * @addr: the address that the call site should call
+ *
+ * This is a very sensitive operation and great care needs
+ * to be taken by the arch.  The operation should carefully
+ * read the location, check to see if what is read is indeed
+ * what we expect it to be, and then on success of the compare,
+ * it should write to the location.
+ *
+ * The code segment at @rec->ip should be a nop
+ *
+ * Return must be:
+ *  0 on success
+ *  -EFAULT on error reading the location
+ *  -EINVAL on a failed compare of the contents
+ *  -EPERM  on error writing to the location
+ * Any other value will be considered a failure.
+ */
+extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
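
Both arch hooks share the contract spelled out in the kernel-doc above: read the instruction at rec->ip, verify it is exactly what is expected, and only then patch it, mapping failures to -EFAULT/-EINVAL/-EPERM. An architecture-neutral sketch of that read-compare-write pattern follows; MCOUNT_INSN_SIZE is a per-arch constant, and arch_call_insn()/arch_nop_insn() are hypothetical helpers standing in for what ftrace_call_replace()/ftrace_nop_replace() used to provide per arch:

	#include <linux/ftrace.h>
	#include <linux/string.h>
	#include <linux/uaccess.h>

	static int ftrace_modify_site(unsigned long ip, unsigned char *old_code,
				      unsigned char *new_code, int size)
	{
		unsigned char cur[16];				/* >= MCOUNT_INSN_SIZE */

		if (probe_kernel_read(cur, (void *)ip, size))
			return -EFAULT;				/* could not read the call site */
		if (memcmp(cur, old_code, size) != 0)
			return -EINVAL;				/* site does not contain what we expect */
		if (probe_kernel_write((void *)ip, new_code, size))
			return -EPERM;				/* text is write-protected */
		return 0;
	}

	int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr)
	{
		/* old = the existing call to 'addr', new = an arch nop of the same length */
		return ftrace_modify_site(rec->ip, arch_call_insn(rec->ip, addr),
					  arch_nop_insn(), MCOUNT_INSN_SIZE);
	}

ftrace_make_call() is the mirror image: the nop is the expected contents and the call instruction is written in.
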
+
+
+/* May be defined in arch */
+extern int ftrace_arch_read_dyn_info(char *buf, int size);
 
 extern int skip_trace(unsigned long ip);
 
@@ -103,7 +183,6 @@ extern void ftrace_release(void *start, unsigned long size);
 
 extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
-
 #else
 # define skip_trace(ip)                                ({ 0; })
 # define ftrace_force_update()                 ({ 0; })
@@ -182,6 +261,12 @@ static inline void __ftrace_enabled_restore(int enabled)
 #endif
 
 #ifdef CONFIG_TRACING
+extern int ftrace_dump_on_oops;
+
+extern void tracing_start(void);
+extern void tracing_stop(void);
+extern void ftrace_off_permanent(void);
+
 extern void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
 
@@ -212,6 +297,9 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 static inline int
 ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)));
 
+static inline void tracing_start(void) { }
+static inline void tracing_stop(void) { }
+static inline void ftrace_off_permanent(void) { }
 static inline int
 ftrace_printk(const char *fmt, ...)
 {
@@ -222,33 +310,167 @@ static inline void ftrace_dump(void) { }
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
-extern void ftrace_init_module(unsigned long *start, unsigned long *end);
+extern void ftrace_init_module(struct module *mod,
+                              unsigned long *start, unsigned long *end);
 #else
 static inline void ftrace_init(void) { }
 static inline void
-ftrace_init_module(unsigned long *start, unsigned long *end) { }
+ftrace_init_module(struct module *mod,
+                  unsigned long *start, unsigned long *end) { }
 #endif
 
+enum {
+       POWER_NONE = 0,
+       POWER_CSTATE = 1,
+       POWER_PSTATE = 2,
+};
+
+struct power_trace {
+#ifdef CONFIG_POWER_TRACER
+       ktime_t                 stamp;
+       ktime_t                 end;
+       int                     type;
+       int                     state;
+#endif
+};
+
+#ifdef CONFIG_POWER_TRACER
+extern void trace_power_start(struct power_trace *it, unsigned int type,
+                                       unsigned int state);
+extern void trace_power_mark(struct power_trace *it, unsigned int type,
+                                       unsigned int state);
+extern void trace_power_end(struct power_trace *it);
+#else
+static inline void trace_power_start(struct power_trace *it, unsigned int type,
+                                       unsigned int state) { }
+static inline void trace_power_mark(struct power_trace *it, unsigned int type,
+                                       unsigned int state) { }
+static inline void trace_power_end(struct power_trace *it) { }
+#endif
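
The power_trace helpers are meant to bracket a C-state or P-state transition; with CONFIG_POWER_TRACER disabled they compile away along with the contents of the on-stack struct. A sketch of the intended use in an idle routine (the surrounding function is hypothetical):

	#include <linux/ftrace.h>

	static void my_mwait_idle(void)			/* hypothetical idle routine */
	{
		struct power_trace it;

		trace_power_start(&it, POWER_CSTATE, 1);	/* entering C1 */
		/* ... architecture-specific mwait/halt ... */
		trace_power_end(&it);				/* records the time since _start */
	}
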
+
+
+/*
+ * Structure that defines an entry function trace.
+ */
+struct ftrace_graph_ent {
+       unsigned long func; /* Current function */
+       int depth;
+};
 
-struct boot_trace {
-       pid_t                   caller;
-       char                    func[KSYM_SYMBOL_LEN];
-       int                     result;
-       unsigned long long      duration;               /* usecs */
-       ktime_t                 calltime;
-       ktime_t                 rettime;
+/*
+ * Structure that defines a return function trace.
+ */
+struct ftrace_graph_ret {
+       unsigned long func; /* Current function */
+       unsigned long long calltime;
+       unsigned long long rettime;
+       /* Number of functions that overran the depth limit for current task */
+       unsigned long overrun;
+       int depth;
 };
 
-#ifdef CONFIG_BOOT_TRACER
-extern void trace_boot(struct boot_trace *it, initcall_t fn);
-extern void start_boot_trace(void);
-extern void stop_boot_trace(void);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+/*
+ * Sometimes we don't want to trace a function with the function
+ * graph tracer, but we still want it traced by the usual function
+ * tracer if the function graph tracer is not configured.
+ */
+#define __notrace_funcgraph            notrace
+
+#define FTRACE_RETFUNC_DEPTH 50
+#define FTRACE_RETSTACK_ALLOC_SIZE 32
+/* Type of the callback handlers for tracing function graph */
+typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
+typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
+
+extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
+                               trace_func_graph_ent_t entryfunc);
+
+extern void ftrace_graph_stop(void);
+
+/* The current handlers in use */
+extern trace_func_graph_ret_t ftrace_graph_return;
+extern trace_func_graph_ent_t ftrace_graph_entry;
+
+extern void unregister_ftrace_graph(void);
+
+extern void ftrace_graph_init_task(struct task_struct *t);
+extern void ftrace_graph_exit_task(struct task_struct *t);
+
+static inline int task_curr_ret_stack(struct task_struct *t)
+{
+       return t->curr_ret_stack;
+}
+
+static inline void pause_graph_tracing(void)
+{
+       atomic_inc(&current->tracing_graph_pause);
+}
+
+static inline void unpause_graph_tracing(void)
+{
+       atomic_dec(&current->tracing_graph_pause);
+}
 #else
-static inline void trace_boot(struct boot_trace *it, initcall_t fn) { }
-static inline void start_boot_trace(void) { }
-static inline void stop_boot_trace(void) { }
+
+#define __notrace_funcgraph
+
+static inline void ftrace_graph_init_task(struct task_struct *t) { }
+static inline void ftrace_graph_exit_task(struct task_struct *t) { }
+
+static inline int task_curr_ret_stack(struct task_struct *tsk)
+{
+       return -1;
+}
+
+static inline void pause_graph_tracing(void) { }
+static inline void unpause_graph_tracing(void) { }
 #endif
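
A graph-tracer client supplies one entry and one return callback; the entry callback's return value decides whether that particular call is traced. A hedged sketch using the prototypes above (callback bodies are illustrative):

	#include <linux/init.h>
	#include <linux/ftrace.h>

	static int my_graph_entry(struct ftrace_graph_ent *trace)
	{
		/* return 0 to skip this call, non-zero to trace it */
		return trace->depth < 5;
	}

	static void my_graph_return(struct ftrace_graph_ret *trace)
	{
		/* trace->rettime - trace->calltime is the time spent in the function */
	}

	static int __init my_graph_init(void)
	{
		/* note the argument order: return handler first, entry handler second */
		return register_ftrace_graph(my_graph_return, my_graph_entry);
	}

	static void my_graph_teardown(void)
	{
		unregister_ftrace_graph();
	}
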
 
+#ifdef CONFIG_TRACING
+#include <linux/sched.h>
+
+/* flags for current->trace */
+enum {
+       TSK_TRACE_FL_TRACE_BIT  = 0,
+       TSK_TRACE_FL_GRAPH_BIT  = 1,
+};
+enum {
+       TSK_TRACE_FL_TRACE      = 1 << TSK_TRACE_FL_TRACE_BIT,
+       TSK_TRACE_FL_GRAPH      = 1 << TSK_TRACE_FL_GRAPH_BIT,
+};
+
+static inline void set_tsk_trace_trace(struct task_struct *tsk)
+{
+       set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
+}
+
+static inline void clear_tsk_trace_trace(struct task_struct *tsk)
+{
+       clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
+}
+
+static inline int test_tsk_trace_trace(struct task_struct *tsk)
+{
+       return tsk->trace & TSK_TRACE_FL_TRACE;
+}
+
+static inline void set_tsk_trace_graph(struct task_struct *tsk)
+{
+       set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
+}
+
+static inline void clear_tsk_trace_graph(struct task_struct *tsk)
+{
+       clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
+}
+
+static inline int test_tsk_trace_graph(struct task_struct *tsk)
+{
+       return tsk->trace & TSK_TRACE_FL_GRAPH;
+}
 
+#endif /* CONFIG_TRACING */
 
 #endif /* _LINUX_FTRACE_H */
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
new file mode 100644 (file)
index 0000000..366a054
--- /dev/null
@@ -0,0 +1,13 @@
+#ifndef _LINUX_FTRACE_IRQ_H
+#define _LINUX_FTRACE_IRQ_H
+
+
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER)
+extern void ftrace_nmi_enter(void);
+extern void ftrace_nmi_exit(void);
+#else
+static inline void ftrace_nmi_enter(void) { }
+static inline void ftrace_nmi_exit(void) { }
+#endif
+
+#endif /* _LINUX_FTRACE_IRQ_H */
index 586ab56..8f627b9 100644 (file)
@@ -164,6 +164,8 @@ union futex_key {
        } both;
 };
 
+#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
+
 #ifdef CONFIG_FUTEX
 extern void exit_robust_list(struct task_struct *curr);
 extern void exit_pi_state_list(struct task_struct *curr);
index 181006c..89a56d7 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/preempt.h>
 #include <linux/smp_lock.h>
 #include <linux/lockdep.h>
+#include <linux/ftrace_irq.h>
 #include <asm/hardirq.h>
 #include <asm/system.h>
 
@@ -161,7 +162,17 @@ extern void irq_enter(void);
  */
 extern void irq_exit(void);
 
-#define nmi_enter()            do { lockdep_off(); __irq_enter(); } while (0)
-#define nmi_exit()             do { __irq_exit(); lockdep_on(); } while (0)
+#define nmi_enter()                            \
+       do {                                    \
+               ftrace_nmi_enter();             \
+               lockdep_off();                  \
+               __irq_enter();                  \
+       } while (0)
+#define nmi_exit()                             \
+       do {                                    \
+               __irq_exit();                   \
+               lockdep_on();                   \
+               ftrace_nmi_exit();              \
+       } while (0)
 
 #endif /* LINUX_HARDIRQ_H */
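
With the ftrace hooks folded into nmi_enter()/nmi_exit(), any NMI path using these macros automatically tells the tracer it is inside an NMI, so ring-buffer and code-patching logic can stay out of the way. The shape of a typical handler, simplified from the x86 do_nmi() pattern (the function name here is illustrative):

	#include <linux/hardirq.h>
	#include <linux/ptrace.h>

	/* sketch of an arch NMI entry point */
	void my_do_nmi(struct pt_regs *regs)
	{
		nmi_enter();	/* ftrace_nmi_enter() -> lockdep_off() -> __irq_enter() */

		/* ... handle the NMI source ... */

		nmi_exit();	/* __irq_exit() -> lockdep_on() -> ftrace_nmi_exit() */
	}
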
index dc7e0d0..269df5a 100644 (file)
@@ -141,6 +141,15 @@ extern int _cond_resched(void);
                (__x < 0) ? -__x : __x;         \
        })
 
+#ifdef CONFIG_PROVE_LOCKING
+void might_fault(void);
+#else
+static inline void might_fault(void)
+{
+       might_sleep();
+}
+#endif
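
might_fault() replaces the bare might_sleep() calls in the user-copy paths (see the usercopy hunks near the top of this diff). Under CONFIG_PROVE_LOCKING it is an out-of-line function that can additionally record the implied mmap_sem dependency for lockdep; otherwise it degrades to might_sleep(). A sketch of the call-site pattern (the wrapper function is hypothetical):

	#include <linux/kernel.h>
	#include <linux/uaccess.h>

	/* hypothetical helper; mirrors how copy_to_user() now starts */
	static unsigned long my_copy_out(void __user *dst, const void *src, unsigned long n)
	{
		might_fault();	/* may sleep; may also check locking rules under PROVE_LOCKING */
		return copy_to_user(dst, src, n);
	}
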
+
 extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(long time);
 NORET_TYPE void panic(const char * fmt, ...)
@@ -188,6 +197,8 @@ extern unsigned long long memparse(const char *ptr, char **retptr);
 extern int core_kernel_text(unsigned long addr);
 extern int __kernel_text_address(unsigned long addr);
 extern int kernel_text_address(unsigned long addr);
+extern int func_ptr_is_kernel_text(void *ptr);
+
 struct pid;
 extern struct pid *session_of_pgrp(struct pid *pgrp);
 
index 29aec6e..8956daf 100644 (file)
@@ -73,6 +73,8 @@ struct lock_class_key {
        struct lockdep_subclass_key     subkeys[MAX_LOCKDEP_SUBCLASSES];
 };
 
+#define LOCKSTAT_POINTS                4
+
 /*
  * The lock-class itself:
  */
@@ -119,7 +121,8 @@ struct lock_class {
        int                             name_version;
 
 #ifdef CONFIG_LOCK_STAT
-       unsigned long                   contention_point[4];
+       unsigned long                   contention_point[LOCKSTAT_POINTS];
+       unsigned long                   contending_point[LOCKSTAT_POINTS];
 #endif
 };
 
@@ -144,6 +147,7 @@ enum bounce_type {
 
 struct lock_class_stats {
        unsigned long                   contention_point[4];
+       unsigned long                   contending_point[4];
        struct lock_time                read_waittime;
        struct lock_time                write_waittime;
        struct lock_time                read_holdtime;
@@ -165,6 +169,7 @@ struct lockdep_map {
        const char                      *name;
 #ifdef CONFIG_LOCK_STAT
        int                             cpu;
+       unsigned long                   ip;
 #endif
 };
 
@@ -356,7 +361,7 @@ struct lock_class_key { };
 #ifdef CONFIG_LOCK_STAT
 
 extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
-extern void lock_acquired(struct lockdep_map *lock);
+extern void lock_acquired(struct lockdep_map *lock, unsigned long ip);
 
 #define LOCK_CONTENDED(_lock, try, lock)                       \
 do {                                                           \
@@ -364,13 +369,13 @@ do {                                                              \
                lock_contended(&(_lock)->dep_map, _RET_IP_);    \
                lock(_lock);                                    \
        }                                                       \
-       lock_acquired(&(_lock)->dep_map);                       \
+       lock_acquired(&(_lock)->dep_map, _RET_IP_);                     \
 } while (0)
 
 #else /* CONFIG_LOCK_STAT */
 
 #define lock_contended(lockdep_map, ip) do {} while (0)
-#define lock_acquired(lockdep_map) do {} while (0)
+#define lock_acquired(lockdep_map, ip) do {} while (0)
 
 #define LOCK_CONTENDED(_lock, try, lock) \
        lock(_lock)
@@ -481,4 +486,22 @@ static inline void print_irqtrace_events(struct task_struct *curr)
 # define lock_map_release(l)                   do { } while (0)
 #endif
 
+#ifdef CONFIG_PROVE_LOCKING
+# define might_lock(lock)                                              \
+do {                                                                   \
+       typecheck(struct lockdep_map *, &(lock)->dep_map);              \
+       lock_acquire(&(lock)->dep_map, 0, 0, 0, 2, NULL, _THIS_IP_);    \
+       lock_release(&(lock)->dep_map, 0, _THIS_IP_);                   \
+} while (0)
+# define might_lock_read(lock)                                                 \
+do {                                                                   \
+       typecheck(struct lockdep_map *, &(lock)->dep_map);              \
+       lock_acquire(&(lock)->dep_map, 0, 0, 1, 2, NULL, _THIS_IP_);    \
+       lock_release(&(lock)->dep_map, 0, _THIS_IP_);                   \
+} while (0)
+#else
+# define might_lock(lock) do { } while (0)
+# define might_lock_read(lock) do { } while (0)
+#endif
+
 #endif /* __LINUX_LOCKDEP_H */
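
might_lock()/might_lock_read() let a function declare "I may take this lock" on every call, so lockdep sees the dependency even on the paths that skip the lock. A hedged example with a made-up driver structure:

	#include <linux/lockdep.h>
	#include <linux/mutex.h>

	struct my_dev {				/* hypothetical structure */
		struct mutex	lock;
		bool		dirty;
	};

	static void my_dev_flush_if_dirty(struct my_dev *dev)
	{
		might_lock(&dev->lock);		/* teach lockdep the dependency unconditionally */

		if (!dev->dirty)
			return;			/* fast path never touches the mutex */

		mutex_lock(&dev->lock);
		/* ... write back ... */
		dev->dirty = false;
		mutex_unlock(&dev->lock);
	}
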
index 889196c..b85e74c 100644 (file)
@@ -12,6 +12,7 @@
  * See the file COPYING for more details.
  */
 
+#include <stdarg.h>
 #include <linux/types.h>
 
 struct module;
@@ -48,10 +49,28 @@ struct marker {
        void (*call)(const struct marker *mdata, void *call_private, ...);
        struct marker_probe_closure single;
        struct marker_probe_closure *multi;
+       const char *tp_name;    /* Optional tracepoint name */
+       void *tp_cb;            /* Optional tracepoint callback */
 } __attribute__((aligned(8)));
 
 #ifdef CONFIG_MARKERS
 
+#define _DEFINE_MARKER(name, tp_name_str, tp_cb, format)               \
+               static const char __mstrtab_##name[]                    \
+               __attribute__((section("__markers_strings")))           \
+               = #name "\0" format;                                    \
+               static struct marker __mark_##name                      \
+               __attribute__((section("__markers"), aligned(8))) =     \
+               { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],   \
+                 0, 0, marker_probe_cb, { __mark_empty_function, NULL},\
+                 NULL, tp_name_str, tp_cb }
+
+#define DEFINE_MARKER(name, format)                                    \
+               _DEFINE_MARKER(name, NULL, NULL, format)
+
+#define DEFINE_MARKER_TP(name, tp_name, tp_cb, format)                 \
+               _DEFINE_MARKER(name, #tp_name, tp_cb, format)
+
 /*
  * Note : the empty asm volatile with read constraint is used here instead of a
  * "used" attribute to fix a gcc 4.1.x bug.
@@ -65,14 +84,7 @@ struct marker {
  */
 #define __trace_mark(generic, name, call_private, format, args...)     \
        do {                                                            \
-               static const char __mstrtab_##name[]                    \
-               __attribute__((section("__markers_strings")))           \
-               = #name "\0" format;                                    \
-               static struct marker __mark_##name                      \
-               __attribute__((section("__markers"), aligned(8))) =     \
-               { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],   \
-               0, 0, marker_probe_cb,                                  \
-               { __mark_empty_function, NULL}, NULL };                 \
+               DEFINE_MARKER(name, format);                            \
                __mark_check_format(format, ## args);                   \
                if (unlikely(__mark_##name.state)) {                    \
                        (*__mark_##name.call)                           \
@@ -80,14 +92,39 @@ struct marker {
                }                                                       \
        } while (0)
 
+#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
+       do {                                                            \
+               void __check_tp_type(void)                              \
+               {                                                       \
+                       register_trace_##tp_name(tp_cb);                \
+               }                                                       \
+               DEFINE_MARKER_TP(name, tp_name, tp_cb, format);         \
+               __mark_check_format(format, ## args);                   \
+               (*__mark_##name.call)(&__mark_##name, call_private,     \
+                                       ## args);                       \
+       } while (0)
+
 extern void marker_update_probe_range(struct marker *begin,
        struct marker *end);
+
+#define GET_MARKER(name)       (__mark_##name)
+
 #else /* !CONFIG_MARKERS */
+#define DEFINE_MARKER(name, tp_name, tp_cb, format)
 #define __trace_mark(generic, name, call_private, format, args...) \
                __mark_check_format(format, ## args)
+#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
+       do {                                                            \
+               void __check_tp_type(void)                              \
+               {                                                       \
+                       register_trace_##tp_name(tp_cb);                \
+               }                                                       \
+               __mark_check_format(format, ## args);                   \
+       } while (0)
 static inline void marker_update_probe_range(struct marker *begin,
        struct marker *end)
 { }
+#define GET_MARKER(name)
 #endif /* CONFIG_MARKERS */
 
 /**
@@ -117,6 +154,20 @@ static inline void marker_update_probe_range(struct marker *begin,
        __trace_mark(1, name, NULL, format, ## args)
 
 /**
+ * trace_mark_tp - Marker in a tracepoint callback
+ * @name: marker name, not quoted.
+ * @tp_name: tracepoint name, not quoted.
+ * @tp_cb: tracepoint callback. Should have an associated global symbol so it
+ *         is not optimized away by the compiler (should not be static).
+ * @format: format string
+ * @args...: variable argument list
+ *
+ * Places a marker in a tracepoint callback.
+ */
+#define trace_mark_tp(name, tp_name, tp_cb, format, args...)   \
+       __trace_mark_tp(name, NULL, tp_name, tp_cb, format, ## args)
+
+/**
  * MARK_NOARGS - Format string for a marker with no argument.
  */
 #define MARK_NOARGS " "
@@ -136,8 +187,6 @@ extern marker_probe_func __mark_empty_function;
 
 extern void marker_probe_cb(const struct marker *mdata,
        void *call_private, ...);
-extern void marker_probe_cb_noarg(const struct marker *mdata,
-       void *call_private, ...);
 
 /*
  * Connect a probe to a marker.
@@ -162,8 +211,10 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
 
 /*
  * marker_synchronize_unregister must be called between the last marker probe
- * unregistration and the end of module exit to make sure there is no caller
- * executing a probe when it is freed.
+ * unregistration and whichever comes first of:
+ * - the end of the module exit function, or
+ * - the freeing of any resource used by the probes,
+ * so that code and data stay valid for any probe that may still be running.
  */
 #define marker_synchronize_unregister() synchronize_sched()
 
index bc6da10..7a0e5c4 100644 (file)
@@ -144,6 +144,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
 /*
  * NOTE: mutex_trylock() follows the spin_trylock() convention,
  *       not the down_trylock() convention!
+ *
+ * Returns 1 if the mutex has been acquired successfully, and 0 on contention.
  */
 extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
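
Because mutex_trylock() follows the spin_trylock() convention, a non-zero return means the lock was taken. A sketch of an opportunistic update that refuses to sleep (stats_lock and the counter are made up):

#include <linux/mutex.h>

static DEFINE_MUTEX(stats_lock);

/* Returns 1 if the counter was updated, 0 if the lock was contended. */
static int stats_try_update(unsigned long *counter, unsigned long delta)
{
	if (!mutex_trylock(&stats_lock))	/* 0: somebody else holds it */
		return 0;
	*counter += delta;
	mutex_unlock(&stats_lock);
	return 1;
}
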
index d7e98ff..bb206c5 100644 (file)
@@ -147,9 +147,9 @@ pid_t pid_vnr(struct pid *pid);
 #define do_each_pid_task(pid, type, task)                              \
        do {                                                            \
                struct hlist_node *pos___;                              \
-               if (pid != NULL)                                        \
+               if ((pid) != NULL)                                      \
                        hlist_for_each_entry_rcu((task), pos___,        \
-                               &pid->tasks[type], pids[type].node) {
+                               &(pid)->tasks[type], pids[type].node) {
 
                        /*
                         * Both old and new leaders may be attached to
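
The extra parentheses around the pid argument are plain macro hygiene: they keep the expansion correct when the caller passes an expression rather than a simple variable. A hedged sketch of the usual iteration pattern, which must run under rcu_read_lock() (the helper itself is hypothetical):

#include <linux/pid.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Count the tasks attached to @pgrp as a process group. */
static int count_pgrp_tasks(struct pid *pgrp)
{
	struct task_struct *p;
	int n = 0;

	rcu_read_lock();
	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
		n++;
	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
	rcu_read_unlock();

	return n;
}
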
index 5f89b62..301dda8 100644 (file)
@@ -41,7 +41,7 @@
 #include <linux/seqlock.h>
 
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-#define RCU_SECONDS_TILL_STALL_CHECK   ( 3 * HZ) /* for rcp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_CHECK   (10 * HZ) /* for rcp->jiffies_stall */
 #define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 
index 86f1f5e..895dc9c 100644 (file)
@@ -142,6 +142,7 @@ struct rcu_head {
  * on the write-side to insure proper synchronization.
  */
 #define rcu_read_lock_sched() preempt_disable()
+#define rcu_read_lock_sched_notrace() preempt_disable_notrace()
 
 /*
  * rcu_read_unlock_sched - marks the end of a RCU-classic critical section
@@ -149,6 +150,7 @@ struct rcu_head {
  * See rcu_read_lock_sched for more information.
  */
 #define rcu_read_unlock_sched() preempt_enable()
+#define rcu_read_unlock_sched_notrace() preempt_enable_notrace()
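
The _notrace variants map to preempt_disable_notrace()/preempt_enable_notrace(), so a read-side critical section can sit in code the function tracer instruments without recursing into the tracer. A sketch under those assumptions (probe_cfg and active_cfg are invented):

#include <linux/rcupdate.h>

struct probe_cfg {
	int verbose;
};

static struct probe_cfg *active_cfg;	/* published with rcu_assign_pointer() */

/* Safe to call from tracer hot paths: no traced preempt_disable() here. */
static int probe_cfg_verbose(void)
{
	struct probe_cfg *cfg;
	int verbose = 0;

	rcu_read_lock_sched_notrace();
	cfg = rcu_dereference(active_cfg);
	if (cfg)
		verbose = cfg->verbose;
	rcu_read_unlock_sched_notrace();

	return verbose;
}
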
 
 
 
index e097c2e..d363467 100644 (file)
@@ -28,17 +28,19 @@ struct ring_buffer_event {
  *                              size = 8 bytes
  *
  * @RINGBUF_TYPE_TIME_STAMP:   Sync time stamp with external clock
- *                              array[0] = tv_nsec
- *                              array[1] = tv_sec
+ *                              array[0]    = tv_nsec
+ *                              array[1..2] = tv_sec
  *                              size = 16 bytes
  *
  * @RINGBUF_TYPE_DATA:         Data record
  *                              If len is zero:
  *                               array[0] holds the actual length
- *                               array[1..(length+3)/4-1] holds data
+ *                               array[1..(length+3)/4] holds data
+ *                               size = 4 + 4 + length (bytes)
  *                              else
  *                               length = len << 2
- *                               array[0..(length+3)/4] holds data
+ *                               array[0..(length+3)/4-1] holds data
+ *                               size = 4 + length (bytes)
  */
 enum ring_buffer_type {
        RINGBUF_TYPE_PADDING,
@@ -122,6 +124,12 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
 
 void tracing_on(void);
 void tracing_off(void);
+void tracing_off_permanent(void);
+
+void *ring_buffer_alloc_read_page(struct ring_buffer *buffer);
+void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
+int ring_buffer_read_page(struct ring_buffer *buffer,
+                         void **data_page, int cpu, int full);
 
 enum ring_buffer_flags {
        RB_FL_OVERWRITE         = 1 << 0,
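
The corrected layout comments translate directly into size arithmetic for DATA events: a short record keeps its word count in the 3-bit len field (length = len << 2, total = 4-byte header + payload), while a long record stores the byte length in array[0] (total = header + length word + payload). A sketch of that calculation, assuming the 2.6.28-era struct ring_buffer_event with type/len/time_delta bitfields and a u32 array[] tail; the kernel's own payload-size helper is ring_buffer_event_length():

#include <linux/ring_buffer.h>

/* Bytes of a RINGBUF_TYPE_DATA event, mirroring the layout comment above. */
static unsigned int data_event_size(const struct ring_buffer_event *event)
{
	unsigned int length;

	if (event->len) {
		/* short record: payload length encoded in the header */
		length = event->len << 2;
		return 4 + length;		/* header + data */
	}
	/* long record: byte length stored in array[0] */
	length = event->array[0];
	return 4 + 4 + length;			/* header + length word + data */
}
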
index 8cccd6d..4240f6b 100644 (file)
@@ -96,6 +96,7 @@ struct exec_domain;
 struct futex_pi_state;
 struct robust_list_head;
 struct bio;
+struct bts_tracer;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -249,7 +250,7 @@ extern void init_idle_bootup_task(struct task_struct *idle);
 extern int runqueue_is_locked(void);
 extern void task_rq_unlock_wait(struct task_struct *p);
 
-extern cpumask_t nohz_cpu_mask;
+extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
 #else
@@ -775,7 +776,6 @@ enum cpu_idle_type {
 
 struct sched_group {
        struct sched_group *next;       /* Must be a circular list */
-       cpumask_t cpumask;
 
        /*
         * CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@ -788,8 +788,15 @@ struct sched_group {
         * (see include/linux/reciprocal_div.h)
         */
        u32 reciprocal_cpu_power;
+
+       unsigned long cpumask[];
 };
 
+static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+{
+       return to_cpumask(sg->cpumask);
+}
+
 enum sched_domain_level {
        SD_LV_NONE = 0,
        SD_LV_SIBLING,
@@ -813,7 +820,6 @@ struct sched_domain {
        struct sched_domain *parent;    /* top domain must be null terminated */
        struct sched_domain *child;     /* bottom domain must be null terminated */
        struct sched_group *groups;     /* the balancing groups of the domain */
-       cpumask_t span;                 /* span of all CPUs in this domain */
        unsigned long min_interval;     /* Minimum balance interval ms */
        unsigned long max_interval;     /* Maximum balance interval ms */
        unsigned int busy_factor;       /* less balancing by factor if busy */
@@ -868,9 +874,17 @@ struct sched_domain {
 #ifdef CONFIG_SCHED_DEBUG
        char *name;
 #endif
+
+       /* span of all CPUs in this domain */
+       unsigned long span[];
 };
 
-extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
+{
+       return to_cpumask(sd->span);
+}
+
+extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                                    struct sched_domain_attr *dattr_new);
 extern int arch_reinit_sched_domains(void);
 
@@ -879,7 +893,7 @@ extern int arch_reinit_sched_domains(void);
 struct sched_domain_attr;
 
 static inline void
-partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                        struct sched_domain_attr *dattr_new)
 {
 }
@@ -961,7 +975,7 @@ struct sched_class {
        void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
 
        void (*set_cpus_allowed)(struct task_struct *p,
-                                const cpumask_t *newmask);
+                                const struct cpumask *newmask);
 
        void (*rq_online)(struct rq *rq);
        void (*rq_offline)(struct rq *rq);
@@ -1163,6 +1177,18 @@ struct task_struct {
        struct list_head ptraced;
        struct list_head ptrace_entry;
 
+#ifdef CONFIG_X86_PTRACE_BTS
+       /*
+        * This is the tracer handle for the ptrace BTS extension.
+        * This field actually belongs to the ptracer task.
+        */
+       struct bts_tracer *bts;
+       /*
+        * The buffer to hold the BTS data.
+        */
+       void *bts_buffer;
+#endif /* CONFIG_X86_PTRACE_BTS */
+
        /* PID/PID hash table linkage. */
        struct pid_link pids[PIDTYPE_MAX];
        struct list_head thread_group;
@@ -1354,6 +1380,23 @@ struct task_struct {
        unsigned long default_timer_slack_ns;
 
        struct list_head        *scm_work_list;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       /* Index of current stored address in ret_stack */

+       int curr_ret_stack;
+       /* Stack of return addresses for return function tracing */
+       struct ftrace_ret_stack *ret_stack;
+       /*
+        * Number of functions that haven't been traced
+        * because of depth overrun.
+        */
+       atomic_t trace_overrun;
+       /* Pause for the tracing */
+       atomic_t tracing_graph_pause;
+#endif
+#ifdef CONFIG_TRACING
+       /* state flags for use by tracers */
+       unsigned long trace;
+#endif
 };
 
 /*
@@ -1592,12 +1635,12 @@ extern cputime_t task_gtime(struct task_struct *p);
 
 #ifdef CONFIG_SMP
 extern int set_cpus_allowed_ptr(struct task_struct *p,
-                               const cpumask_t *new_mask);
+                               const struct cpumask *new_mask);
 #else
 static inline int set_cpus_allowed_ptr(struct task_struct *p,
-                                      const cpumask_t *new_mask)
+                                      const struct cpumask *new_mask)
 {
-       if (!cpu_isset(0, *new_mask))
+       if (!cpumask_test_cpu(0, new_mask))
                return -EINVAL;
        return 0;
 }
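
The affinity interfaces now take struct cpumask pointers; combined with cpumask_var_t this keeps NR_CPUS=4096-sized masks off the kernel stack. A sketch of pinning a task to a single CPU under the new API (hypothetical helper, error handling kept minimal):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/sched.h>

/* Restrict @p to @cpu using a heap-allocated mask. */
static int pin_task_to_cpu(struct task_struct *p, int cpu)
{
	cpumask_var_t mask;
	int ret;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(mask);
	cpumask_set_cpu(cpu, mask);
	ret = set_cpus_allowed_ptr(p, mask);

	free_cpumask_var(mask);
	return ret;
}
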
@@ -2210,8 +2253,8 @@ __trace_special(void *__tr, void *__data,
 }
 #endif
 
-extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
-extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
+extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
+extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
 extern int sched_mc_power_savings, sched_smt_power_savings;