Merge git://git.linux-nfs.org/projects/trondmy/nfs-2.6
Linus Torvalds [Fri, 11 Sep 2009 23:39:11 +0000 (16:39 -0700)]
* git://git.linux-nfs.org/projects/trondmy/nfs-2.6: (87 commits)
  NFSv4: Disallow 'mount -t nfs4 -overs=2' and 'mount -t nfs4 -overs=3'
  NFS: Allow the "nfs" file system type to support NFSv4
  NFS: Move details of nfs4_get_sb() to a helper
  NFS: Refactor NFSv4 text-based mount option validation
  NFS: Mount option parser should detect missing "port="
  NFS: out of date comment regarding O_EXCL above nfs3_proc_create()
  NFS: Handle a zero-length auth flavor list
  SUNRPC: Ensure that sunrpc gets initialised before nfs, lockd, etc...
  nfs: fix compile error in rpc_pipefs.h
  nfs: Remove reference to generic_osync_inode from a comment
  SUNRPC: cache must take a reference to the cache detail's module on open()
  NFS: Use the DNS resolver in the mount code.
  NFS: Add a dns resolver for use with NFSv4 referrals and migration
  SUNRPC: Fix a typo in cache_pipefs_files
  nfs: nfs4xdr: optimize low level decoding
  nfs: nfs4xdr: get rid of READ_BUF
  nfs: nfs4xdr: simplify decode_exchange_id by reusing decode_opaque_inline
  nfs: nfs4xdr: get rid of COPYMEM
  nfs: nfs4xdr: introduce decode_sessionid helper
  nfs: nfs4xdr: introduce decode_verifier helper
  ...

278 files changed:
Documentation/RCU/RTFP.txt
Documentation/RCU/UP.txt
Documentation/RCU/checklist.txt
Documentation/RCU/rcu.txt
Documentation/RCU/rcubarrier.txt
Documentation/RCU/torture.txt
Documentation/RCU/trace.txt
Documentation/RCU/whatisRCU.txt
Documentation/feature-removal-schedule.txt
Documentation/kernel-parameters.txt
Documentation/trace/events.txt
Documentation/trace/ftrace.txt
Documentation/trace/function-graph-fold.vim [new file with mode: 0644]
Documentation/trace/ring-buffer-design.txt [new file with mode: 0644]
arch/Kconfig
arch/ia64/include/asm/dma-mapping.h
arch/ia64/xen/time.c
arch/m68k/include/asm/entry_mm.h
arch/m68k/include/asm/entry_no.h
arch/m68k/include/asm/math-emu.h
arch/m68k/include/asm/thread_info_mm.h
arch/m68k/kernel/asm-offsets.c
arch/m68k/kernel/entry.S
arch/m68k/math-emu/fp_entry.S
arch/powerpc/include/asm/dma-mapping.h
arch/powerpc/include/asm/pgtable.h
arch/powerpc/include/asm/spinlock.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/dma-swiotlb.c
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/perf_callchain.c [new file with mode: 0644]
arch/powerpc/mm/slb.c
arch/powerpc/mm/stab.c
arch/s390/Kconfig
arch/s390/defconfig
arch/s390/include/asm/spinlock.h
arch/s390/include/asm/thread_info.h
arch/s390/kernel/entry.S
arch/s390/kernel/entry64.S
arch/s390/kernel/ftrace.c
arch/s390/kernel/ptrace.c
arch/sparc/Kconfig
arch/sparc/include/asm/dma-mapping.h
arch/sparc/include/asm/irq_64.h
arch/sparc/include/asm/pci.h
arch/sparc/include/asm/pci_32.h
arch/sparc/include/asm/pci_64.h
arch/sparc/include/asm/spinlock_32.h
arch/sparc/include/asm/spinlock_64.h
arch/sparc/kernel/Makefile
arch/sparc/kernel/dma.c
arch/sparc/kernel/dma.h [deleted file]
arch/sparc/kernel/iommu.c
arch/sparc/kernel/ioport.c
arch/sparc/kernel/pci.c
arch/sparc/kernel/pci_sun4v.c
arch/sparc/kernel/process_64.c
arch/x86/Kconfig
arch/x86/configs/i386_defconfig
arch/x86/configs/x86_64_defconfig
arch/x86/include/asm/amd_iommu.h
arch/x86/include/asm/amd_iommu_types.h
arch/x86/include/asm/dma-mapping.h
arch/x86/include/asm/ftrace.h
arch/x86/include/asm/nmi.h
arch/x86/include/asm/perf_counter.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/topology.h
arch/x86/include/asm/unistd_32.h
arch/x86/include/asm/unistd_64.h
arch/x86/kernel/amd_iommu.c
arch/x86/kernel/amd_iommu_init.c
arch/x86/kernel/apic/nmi.c
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/cpu/perf_counter.c
arch/x86/kernel/ftrace.c
arch/x86/kernel/pci-dma.c
arch/x86/kernel/pci-gart_64.c
arch/x86/kernel/pci-nommu.c
arch/x86/kernel/pci-swiotlb.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/sys_x86_64.c
arch/x86/oprofile/nmi_int.c
arch/x86/oprofile/op_counter.h
arch/x86/oprofile/op_model_amd.c
arch/x86/oprofile/op_model_p4.c
arch/x86/oprofile/op_model_ppro.c
arch/x86/oprofile/op_x86_model.h
arch/x86/pci/direct.c
drivers/acpi/blacklist.c
drivers/ata/Kconfig
drivers/ata/Makefile
drivers/ata/ahci.c
drivers/ata/libata-acpi.c
drivers/ata/libata-core.c
drivers/ata/libata-eh.c
drivers/ata/libata-pmp.c
drivers/ata/libata-scsi.c
drivers/ata/libata.h
drivers/ata/pata_atiixp.c
drivers/ata/pata_cs5535.c
drivers/ata/pata_octeon_cf.c
drivers/ata/pata_platform.c
drivers/ata/pata_rb532_cf.c
drivers/ata/pata_rdc.c [new file with mode: 0644]
drivers/ata/pata_rz1000.c
drivers/ata/sata_fsl.c
drivers/ata/sata_inic162x.c
drivers/ata/sata_mv.c
drivers/ata/sata_sil.c
drivers/ata/sata_sil24.c
drivers/ata/sata_sis.c
drivers/char/sysrq.c
drivers/firmware/dmi_scan.c
drivers/ide/atiixp.c
drivers/oprofile/cpu_buffer.c
drivers/oprofile/oprof.c
drivers/oprofile/oprof.h
drivers/oprofile/oprofile_files.c
drivers/oprofile/oprofile_stats.c
drivers/oprofile/oprofile_stats.h
drivers/pci/intr_remapping.c
drivers/pci/quirks.c
fs/dcache.c
fs/locks.c
include/asm-generic/dma-mapping-common.h
include/linux/ata.h
include/linux/cpu.h
include/linux/dma-mapping.h
include/linux/dmi.h
include/linux/ftrace_event.h
include/linux/hardirq.h
include/linux/init_task.h
include/linux/interrupt.h
include/linux/irq.h
include/linux/irqnr.h
include/linux/kernel.h
include/linux/libata.h
include/linux/lockdep.h
include/linux/module.h
include/linux/nmi.h
include/linux/oprofile.h
include/linux/pagemap.h
include/linux/pci_ids.h
include/linux/perf_counter.h
include/linux/rcuclassic.h [deleted file]
include/linux/rcupdate.h
include/linux/rcupreempt.h [deleted file]
include/linux/rcupreempt_trace.h [deleted file]
include/linux/rcutree.h
include/linux/ring_buffer.h
include/linux/sched.h
include/linux/spinlock.h
include/linux/spinlock_api_smp.h
include/linux/swiotlb.h
include/linux/syscalls.h
include/linux/topology.h
include/linux/tracepoint.h
include/trace/define_trace.h
include/trace/events/module.h [new file with mode: 0644]
include/trace/events/sched.h
include/trace/events/syscalls.h [new file with mode: 0644]
include/trace/ftrace.h
include/trace/syscall.h
init/Kconfig
init/main.c
kernel/Makefile
kernel/exit.c
kernel/fork.c
kernel/futex.c
kernel/irq/chip.c
kernel/irq/handle.c
kernel/irq/internals.h
kernel/irq/manage.c
kernel/irq/pm.c
kernel/irq/resend.c
kernel/irq/spurious.c
kernel/kmod.c
kernel/kprobes.c
kernel/kthread.c
kernel/lockdep.c
kernel/lockdep_internals.h
kernel/lockdep_proc.c
kernel/module.c
kernel/perf_counter.c
kernel/printk.c
kernel/rcuclassic.c [deleted file]
kernel/rcupdate.c
kernel/rcupreempt.c [deleted file]
kernel/rcupreempt_trace.c [deleted file]
kernel/rcutorture.c
kernel/rcutree.c
kernel/rcutree.h
kernel/rcutree_plugin.h [new file with mode: 0644]
kernel/rcutree_trace.c
kernel/sched.c
kernel/sched_cpupri.c
kernel/sched_debug.c
kernel/sched_fair.c
kernel/sched_features.h
kernel/sched_rt.c
kernel/softirq.c
kernel/spinlock.c
kernel/sysctl.c
kernel/timer.c
kernel/trace/Kconfig
kernel/trace/blktrace.c
kernel/trace/ftrace.c
kernel/trace/kmemtrace.c
kernel/trace/ring_buffer.c
kernel/trace/trace.c
kernel/trace/trace.h
kernel/trace/trace_boot.c
kernel/trace/trace_events.c
kernel/trace/trace_events_filter.c
kernel/trace/trace_export.c
kernel/trace/trace_functions.c
kernel/trace/trace_functions_graph.c
kernel/trace/trace_irqsoff.c
kernel/trace/trace_mmiotrace.c
kernel/trace/trace_power.c
kernel/trace/trace_sched_switch.c
kernel/trace/trace_sched_wakeup.c
kernel/trace/trace_selftest.c
kernel/trace/trace_stack.c
kernel/trace/trace_stat.c
kernel/trace/trace_stat.h
kernel/trace/trace_syscalls.c
kernel/trace/trace_workqueue.c
kernel/tracepoint.c
kernel/workqueue.c
lib/Kconfig.debug
lib/swiotlb.c
scripts/recordmcount.pl
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/Makefile
tools/perf/builtin-annotate.c
tools/perf/builtin-help.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-stat.c
tools/perf/builtin-top.c
tools/perf/builtin-trace.c [new file with mode: 0644]
tools/perf/builtin.h
tools/perf/perf.c
tools/perf/util/abspath.c
tools/perf/util/cache.h
tools/perf/util/callchain.c
tools/perf/util/callchain.h
tools/perf/util/color.c
tools/perf/util/color.h
tools/perf/util/config.c
tools/perf/util/debug.c [new file with mode: 0644]
tools/perf/util/debug.h [new file with mode: 0644]
tools/perf/util/event.h [new file with mode: 0644]
tools/perf/util/exec_cmd.c
tools/perf/util/header.c
tools/perf/util/header.h
tools/perf/util/map.c [new file with mode: 0644]
tools/perf/util/module.c
tools/perf/util/parse-events.c
tools/perf/util/parse-events.h
tools/perf/util/parse-options.c
tools/perf/util/path.c
tools/perf/util/run-command.c
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/thread.c [new file with mode: 0644]
tools/perf/util/thread.h [new file with mode: 0644]
tools/perf/util/trace-event-info.c [new file with mode: 0644]
tools/perf/util/trace-event-parse.c [new file with mode: 0644]
tools/perf/util/trace-event-read.c [new file with mode: 0644]
tools/perf/util/trace-event.h [new file with mode: 0644]
tools/perf/util/util.h
tools/perf/util/values.c [new file with mode: 0644]
tools/perf/util/values.h [new file with mode: 0644]

index 9f711d2..d2b8523 100644 (file)
@@ -743,3 +743,80 @@ Revised:
        RCU, realtime RCU, sleepable RCU, performance.
 "
 }
+
+@article{PaulEMcKenney2008RCUOSR
+,author="Paul E. McKenney and Jonathan Walpole"
+,title="Introducing technology into the {Linux} kernel: a case study"
+,Year="2008"
+,journal="SIGOPS Oper. Syst. Rev."
+,volume="42"
+,number="5"
+,pages="4--17"
+,issn="0163-5980"
+,doi={http://doi.acm.org/10.1145/1400097.1400099}
+,publisher="ACM"
+,address="New York, NY, USA"
+,annotation={
+       Linux changed RCU to a far greater degree than RCU has changed Linux.
+}
+}
+
+@unpublished{PaulEMcKenney2008HierarchicalRCU
+,Author="Paul E. McKenney"
+,Title="Hierarchical {RCU}"
+,month="November"
+,day="3"
+,year="2008"
+,note="Available:
+\url{http://lwn.net/Articles/305782/}
+[Viewed November 6, 2008]"
+,annotation="
+       RCU with combining-tree-based grace-period detection,
+       permitting it to handle thousands of CPUs.
+"
+}
+
+@conference{PaulEMcKenney2009MaliciousURCU
+,Author="Paul E. McKenney"
+,Title="Using a Malicious User-Level {RCU} to Torture {RCU}-Based Algorithms"
+,Booktitle="linux.conf.au 2009"
+,month="January"
+,year="2009"
+,address="Hobart, Australia"
+,note="Available:
+\url{http://www.rdrop.com/users/paulmck/RCU/urcutorture.2009.01.22a.pdf}
+[Viewed February 2, 2009]"
+,annotation="
+       Realtime RCU and torture-testing RCU uses.
+"
+}
+
+@unpublished{MathieuDesnoyers2009URCU
+,Author="Mathieu Desnoyers"
+,Title="[{RFC} git tree] Userspace {RCU} (urcu) for {Linux}"
+,month="February"
+,day="5"
+,year="2009"
+,note="Available:
+\url{http://lkml.org/lkml/2009/2/5/572}
+\url{git://lttng.org/userspace-rcu.git}
+[Viewed February 20, 2009]"
+,annotation="
+       Mathieu Desnoyers's user-space RCU implementation.
+       git://lttng.org/userspace-rcu.git
+"
+}
+
+@unpublished{PaulEMcKenney2009BloatWatchRCU
+,Author="Paul E. McKenney"
+,Title="{RCU}: The {Bloatwatch} Edition"
+,month="March"
+,day="17"
+,year="2009"
+,note="Available:
+\url{http://lwn.net/Articles/323929/}
+[Viewed March 20, 2009]"
+,annotation="
+       Uniprocessor assumptions allow simplified RCU implementation.
+"
+}
index aab4a9e..90ec534 100644 (file)
@@ -2,14 +2,13 @@ RCU on Uniprocessor Systems
 
 
 A common misconception is that, on UP systems, the call_rcu() primitive
-may immediately invoke its function, and that the synchronize_rcu()
-primitive may return immediately.  The basis of this misconception
+may immediately invoke its function.  The basis of this misconception
 is that since there is only one CPU, it should not be necessary to
 wait for anything else to get done, since there are no other CPUs for
 anything else to be happening on.  Although this approach will -sort- -of-
 work a surprising amount of the time, it is a very bad idea in general.
-This document presents three examples that demonstrate exactly how bad an
-idea this is.
+This document presents three examples that demonstrate exactly how bad
+an idea this is.
 
 
 Example 1: softirq Suicide
@@ -82,11 +81,18 @@ Quick Quiz #2: What locking restriction must RCU callbacks respect?
 
 Summary
 
-Permitting call_rcu() to immediately invoke its arguments or permitting
-synchronize_rcu() to immediately return breaks RCU, even on a UP system.
-So do not do it!  Even on a UP system, the RCU infrastructure -must-
-respect grace periods, and -must- invoke callbacks from a known environment
-in which no locks are held.
+Permitting call_rcu() to immediately invoke its arguments breaks RCU,
+even on a UP system.  So do not do it!  Even on a UP system, the RCU
+infrastructure -must- respect grace periods, and -must- invoke callbacks
+from a known environment in which no locks are held.
+
+It -is- safe for synchronize_sched() and synchronize_rcu_bh() to return
+immediately on a UP system.  It is also safe for synchronize_rcu()
+to return immediately on UP systems, except when running preemptable
+RCU.
+
+Quick Quiz #3: Why can't synchronize_rcu() return immediately on
+       UP systems running preemptable RCU?
 
 
 Answer to Quick Quiz #1:
@@ -117,3 +123,13 @@ Answer to Quick Quiz #2:
        callbacks acquire locks directly.  However, a great many RCU
        callbacks do acquire locks -indirectly-, for example, via
        the kfree() primitive.
+
+Answer to Quick Quiz #3:
+       Why can't synchronize_rcu() return immediately on UP systems
+       running preemptable RCU?
+
+       Because some other task might have been preempted in the middle
+       of an RCU read-side critical section.  If synchronize_rcu()
+       simply immediately returned, it would prematurely signal the
+       end of the grace period, which would come as a nasty shock to
+       that other thread when it started running again.
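+
+       For example, a hypothetical reader (gp and do_something_with()
+       are made-up names for this sketch) might be preempted here:
+
+               rcu_read_lock();
+               p = rcu_dereference(gp);
+               /* ... preempted at this point ... */
+               do_something_with(p);
+               rcu_read_unlock();
+
+       If the updater's synchronize_rcu() returned immediately while
+       this reader was preempted, the updater could free the structure
+       that p still references.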
index accfe2f..51525a3 100644 (file)
@@ -11,7 +11,10 @@ over a rather long period of time, but improvements are always welcome!
        structure is updated more than about 10% of the time, then
        you should strongly consider some other approach, unless
        detailed performance measurements show that RCU is nonetheless
-       the right tool for the job.
+       the right tool for the job.  Yes, you might think of RCU
+       as simply cutting overhead off of the readers and imposing it
+       on the writers.  That is exactly why normal uses of RCU will
+       do much more reading than updating.
 
        Another exception is where performance is not an issue, and RCU
        provides a simpler implementation.  An example of this situation
@@ -240,10 +243,11 @@ over a rather long period of time, but improvements are always welcome!
        instead need to use synchronize_irq() or synchronize_sched().
 
 12.    Any lock acquired by an RCU callback must be acquired elsewhere
-       with irq disabled, e.g., via spin_lock_irqsave().  Failing to
-       disable irq on a given acquisition of that lock will result in
-       deadlock as soon as the RCU callback happens to interrupt that
-       acquisition's critical section.
+       with softirq disabled, e.g., via spin_lock_irqsave(),
+       spin_lock_bh(), etc.  Failing to disable softirq on a given
+       acquisition of that lock will result in deadlock as soon as the
+       RCU callback happens to interrupt that acquisition's critical
+       section.
 
 13.    RCU callbacks can be and are executed in parallel.  In many cases,
        the callback code simply wrappers around kfree(), so that this
@@ -310,3 +314,9 @@ over a rather long period of time, but improvements are always welcome!
        Because these primitives only wait for pre-existing readers,
        it is the caller's responsibility to guarantee safety to
        any subsequent readers.
+
+16.    The various RCU read-side primitives do -not- contain memory
+       barriers.  The CPU (and in some cases, the compiler) is free
+       to reorder code into and out of RCU read-side critical sections.
+       It is the responsibility of the RCU update-side primitives to
+       deal with this.
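+
+       For example, the update side can rely on rcu_assign_pointer(),
+       which supplies the needed memory barrier before publishing a
+       structure (gp and p below are made-up names for this sketch):
+
+               p = kmalloc(sizeof(*p), GFP_KERNEL);
+               p->a = 1;
+               p->b = 2;
+               rcu_assign_pointer(gp, p);  /* barrier before publication */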
index 7aa2002..2a23523 100644 (file)
@@ -36,7 +36,7 @@ o     How can the updater tell when a grace period has completed
        executed in user mode, or executed in the idle loop, we can
        safely free up that item.
 
-       Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the
+       Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the
        same effect, but require that the readers manipulate CPU-local
        counters.  These counters allow limited types of blocking
        within RCU read-side critical sections.  SRCU also uses
@@ -79,10 +79,10 @@ o   I hear that RCU is patented?  What is with that?
 o      I hear that RCU needs work in order to support realtime kernels?
 
        This work is largely completed.  Realtime-friendly RCU can be
-       enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter.
-       However, work is in progress for enabling priority boosting of
-       preempted RCU read-side critical sections.  This is needed if you
-       have CPU-bound realtime threads.
+       enabled via the CONFIG_TREE_PREEMPT_RCU kernel configuration
+       parameter.  However, work is in progress for enabling priority
+       boosting of preempted RCU read-side critical sections.  This is
+       needed if you have CPU-bound realtime threads.
 
 o      Where can I find more information on RCU?
 
index 909602d..e439a0e 100644 (file)
@@ -170,6 +170,13 @@ module invokes call_rcu() from timers, you will need to first cancel all
 the timers, and only then invoke rcu_barrier() to wait for any remaining
 RCU callbacks to complete.
 
+Of course, if your module uses call_rcu_bh(), you will need to invoke
+rcu_barrier_bh() before unloading.  Similarly, if your module uses
+call_rcu_sched(), you will need to invoke rcu_barrier_sched() before
+unloading.  If your module uses call_rcu(), call_rcu_bh(), -and-
+call_rcu_sched(), then you will need to invoke each of rcu_barrier(),
+rcu_barrier_bh(), and rcu_barrier_sched().
+
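+For example, a module whose only RCU usage is call_rcu() might arrange
+its exit handler as in the following sketch (foo_exit() and
+foo_stop_new_callbacks() are made-up names):
+
+	static void __exit foo_exit(void)
+	{
+		foo_stop_new_callbacks();	/* stop posting new call_rcu()s */
+		rcu_barrier();			/* wait for outstanding callbacks */
+	}
+	module_exit(foo_exit);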
 
 Implementing rcu_barrier()
 
index a342b6e..9dba3bb 100644 (file)
@@ -76,8 +76,10 @@ torture_type The type of RCU to test: "rcu" for the rcu_read_lock() API,
                "rcu_sync" for rcu_read_lock() with synchronous reclamation,
                "rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for
                rcu_read_lock_bh() with synchronous reclamation, "srcu" for
-               the "srcu_read_lock()" API, and "sched" for the use of
-               preempt_disable() together with synchronize_sched().
+               the "srcu_read_lock()" API, "sched" for the use of
+               preempt_disable() together with synchronize_sched(),
+               and "sched_expedited" for the use of preempt_disable()
+               with synchronize_sched_expedited().
 
 verbose                Enable debug printk()s.  Default is disabled.
 
@@ -162,6 +164,23 @@ of the "old" and "current" counters for the corresponding CPU.  The
 "idx" value maps the "old" and "current" values to the underlying array,
 and is useful for debugging.
 
+Similarly, sched_expedited RCU provides the following:
+
+       sched_expedited-torture: rtc: d0000000016c1880 ver: 1090796 tfle: 0 rta: 1090796 rtaf: 0 rtf: 1090787 rtmbe: 0 nt: 27713319
+       sched_expedited-torture: Reader Pipe:  12660320201 95875 0 0 0 0 0 0 0 0 0
+       sched_expedited-torture: Reader Batch:  12660424885 0 0 0 0 0 0 0 0 0 0
+       sched_expedited-torture: Free-Block Circulation:  1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
+       state: -1 / 0:0 3:0 4:0
+
+As before, the first four lines are similar to those for RCU.
+The last line shows the task-migration state.  The first number is
+-1 if synchronize_sched_expedited() is idle, -2 if in the process of
+posting wakeups to the migration kthreads, and N when waiting on CPU N.
+Each of the colon-separated fields following the "/" is a CPU:state pair.
+Valid states are "0" for idle, "1" for waiting for quiescent state,
+"2" for passed through quiescent state, and "3" when a race with a
+CPU-hotplug event forces use of the synchronize_sched() primitive.
+
 
 USAGE
 
index 02cced1..187bbf1 100644 (file)
@@ -191,8 +191,7 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy).
 
 The output of "cat rcu/rcudata" looks as follows:
 
-rcu:
-rcu:
+rcu_sched:
   0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10
   1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10
   2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10
@@ -306,7 +305,7 @@ comma-separated-variable spreadsheet format.
 
 The output of "cat rcu/rcugp" looks as follows:
 
-rcu: completed=33062  gpnum=33063
+rcu_sched: completed=33062  gpnum=33063
 rcu_bh: completed=464  gpnum=464
 
 Again, this output is for both "rcu" and "rcu_bh".  The fields are
@@ -413,7 +412,7 @@ o   Each element of the form "1/1 0:127 ^0" represents one struct
 
 The output of "cat rcu/rcu_pending" looks as follows:
 
-rcu:
+rcu_sched:
   0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
   1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
   2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
index 9617082..e41a7fe 100644 (file)
@@ -136,10 +136,10 @@ rcu_read_lock()
        Used by a reader to inform the reclaimer that the reader is
        entering an RCU read-side critical section.  It is illegal
        to block while in an RCU read-side critical section, though
-       kernels built with CONFIG_PREEMPT_RCU can preempt RCU read-side
-       critical sections.  Any RCU-protected data structure accessed
-       during an RCU read-side critical section is guaranteed to remain
-       unreclaimed for the full duration of that critical section.
+       kernels built with CONFIG_TREE_PREEMPT_RCU can preempt RCU
+       read-side critical sections.  Any RCU-protected data structure
+       accessed during an RCU read-side critical section is guaranteed to
+       remain unreclaimed for the full duration of that critical section.
        Reference counts may be used in conjunction with RCU to maintain
        longer-term references to data structures.
 
@@ -785,6 +785,7 @@ RCU pointer/list traversal:
        rcu_dereference
        list_for_each_entry_rcu
        hlist_for_each_entry_rcu
+       hlist_nulls_for_each_entry_rcu
 
        list_for_each_continue_rcu      (to be deprecated in favor of new
                                         list_for_each_entry_continue_rcu)
@@ -807,19 +808,23 @@ RCU:      Critical sections       Grace period            Barrier
 
        rcu_read_lock           synchronize_net         rcu_barrier
        rcu_read_unlock         synchronize_rcu
+                               synchronize_rcu_expedited
                                call_rcu
 
 
 bh:    Critical sections       Grace period            Barrier
 
        rcu_read_lock_bh        call_rcu_bh             rcu_barrier_bh
-       rcu_read_unlock_bh
+       rcu_read_unlock_bh      synchronize_rcu_bh
+                               synchronize_rcu_bh_expedited
 
 
 sched: Critical sections       Grace period            Barrier
 
-       [preempt_disable]       synchronize_sched       rcu_barrier_sched
-       [and friends]           call_rcu_sched
+       rcu_read_lock_sched     synchronize_sched       rcu_barrier_sched
+       rcu_read_unlock_sched   call_rcu_sched
+       [preempt_disable]       synchronize_sched_expedited
+       [and friends]
 
 
 SRCU:  Critical sections       Grace period            Barrier
@@ -827,6 +832,9 @@ SRCU:       Critical sections       Grace period            Barrier
        srcu_read_lock          synchronize_srcu        N/A
        srcu_read_unlock
 
+SRCU:  Initialization/cleanup
+       init_srcu_struct
+       cleanup_srcu_struct
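+
+For example, a hypothetical SRCU user (my_srcu is a made-up name)
+pairs these as follows:
+
+	struct srcu_struct my_srcu;
+	int idx;
+
+	init_srcu_struct(&my_srcu);		/* before first use */
+	idx = srcu_read_lock(&my_srcu);
+	/* ... SRCU read-side critical section ... */
+	srcu_read_unlock(&my_srcu, idx);
+	synchronize_srcu(&my_srcu);		/* an updater waits for readers */
+	cleanup_srcu_struct(&my_srcu);		/* after all use has finished */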
 
 See the comment headers in the source code (or the docbook generated
 from them) for more information.
index f0690bb..bb3a53c 100644 (file)
@@ -206,24 +206,6 @@ Who:       Len Brown <len.brown@intel.com>
 
 ---------------------------
 
-What: libata spindown skipping and warning
-When: Dec 2008
-Why:  Some halt(8) implementations synchronize caches for and spin
-      down libata disks because libata didn't use to spin down disk on
-      system halt (only synchronized caches).
-      Spin down on system halt is now implemented.  sysfs node
-      /sys/class/scsi_disk/h:c:i:l/manage_start_stop is present if
-      spin down support is available.
-      Because issuing spin down command to an already spun down disk
-      makes some disks spin up just to spin down again, libata tracks
-      device spindown status to skip the extra spindown command and
-      warn about it.
-      This is to give userspace tools the time to get updated and will
-      be removed after userspace is reasonably updated.
-Who:  Tejun Heo <htejun@gmail.com>
-
----------------------------
-
 What:  i386/x86_64 bzImage symlinks
 When:  April 2010
 
@@ -394,15 +376,6 @@ Who:       Thomas Gleixner <tglx@linutronix.de>
 
 -----------------------------
 
-What:  obsolete generic irq defines and typedefs
-When:  2.6.30
-Why:   The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t)
-       have been kept around for migration reasons. After more than two years
-       it's time to remove them finally
-Who:   Thomas Gleixner <tglx@linutronix.de>
-
----------------------------
-
 What:  fakephp and associated sysfs files in /sys/bus/pci/slots/
 When:  2011
 Why:   In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to
index ce88537..5d4427d 100644 (file)
@@ -2509,6 +2509,11 @@ and is between 256 and 4096 characters. It is defined in the file
        trace_buf_size=nn[KMG]
                        [FTRACE] will set tracing buffer size.
 
+       trace_event=[event-list]
+                       [FTRACE] Set and start specified trace events in order
+                       to facilitate early boot debugging.
+                       See also Documentation/trace/events.txt
+
        trix=           [HW,OSS] MediaTrix AudioTrix Pro
                        Format:
                        <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
index f157d75..2bcc8d4 100644 (file)
@@ -83,6 +83,15 @@ When reading one of these enable files, there are four results:
  X - there is a mixture of events enabled and disabled
  ? - this file does not affect any event
 
+2.3 Boot option
+---------------
+
+In order to facilitate early boot debugging, use boot option:
+
+       trace_event=[event-list]
+
+The format of this boot option is the same as described in section 2.1.
+
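+For example (the available event names depend on the kernel
+configuration), booting with:
+
+	trace_event=sched:sched_switch,sched:sched_wakeup
+
+would enable those two scheduler events from early boot.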
 3. Defining an event-enabled tracepoint
 =======================================
 
index a39b3c7..355d0f1 100644 (file)
@@ -85,26 +85,19 @@ of ftrace. Here is a list of some of the key files:
        This file holds the output of the trace in a human
        readable format (described below).
 
-  latency_trace:
-
-       This file shows the same trace but the information
-       is organized more to display possible latencies
-       in the system (described below).
-
   trace_pipe:
 
        The output is the same as the "trace" file but this
        file is meant to be streamed with live tracing.
-       Reads from this file will block until new data
-       is retrieved. Unlike the "trace" and "latency_trace"
-       files, this file is a consumer. This means reading
-       from this file causes sequential reads to display
-       more current data. Once data is read from this
-       file, it is consumed, and will not be read
-       again with a sequential read. The "trace" and
-       "latency_trace" files are static, and if the
-       tracer is not adding more data, they will display
-       the same information every time they are read.
+       Reads from this file will block until new data is
+       retrieved.  Unlike the "trace" file, this file is a
+       consumer. This means reading from this file causes
+       sequential reads to display more current data. Once
+       data is read from this file, it is consumed, and
+       will not be read again with a sequential read. The
+       "trace" file is static, and if the tracer is not
+       adding more data, it will display the same
+       information every time it is read.
 
   trace_options:
 
@@ -117,10 +110,10 @@ of ftrace. Here is a list of some of the key files:
        Some of the tracers record the max latency.
        For example, the time interrupts are disabled.
        This time is saved in this file. The max trace
-       will also be stored, and displayed by either
-       "trace" or "latency_trace".  A new max trace will
-       only be recorded if the latency is greater than
-       the value in this file. (in microseconds)
+       will also be stored, and displayed by "trace".
+       A new max trace will only be recorded if the
+       latency is greater than the value in this
+       file. (in microseconds)
 
   buffer_size_kb:
 
@@ -210,7 +203,7 @@ Here is the list of current tracers that may be configured.
        the trace with the longest max latency.
        See tracing_max_latency. When a new max is recorded,
        it replaces the old trace. It is best to view this
-       trace via the latency_trace file.
+       trace with the latency-format option enabled.
 
   "preemptoff"
 
@@ -307,8 +300,8 @@ the lowest priority thread (pid 0).
 Latency trace format
 --------------------
 
-For traces that display latency times, the latency_trace file
-gives somewhat more information to see why a latency happened.
+When the latency-format option is enabled, the trace file gives
+somewhat more information to see why a latency happened.
 Here is a typical trace.
 
 # tracer: irqsoff
@@ -380,9 +373,10 @@ explains which is which.
 
 The above is mostly meaningful for kernel developers.
 
-  time: This differs from the trace file output. The trace file output
-       includes an absolute timestamp. The timestamp used by the
-       latency_trace file is relative to the start of the trace.
+  time: When the latency-format option is enabled, the trace file
+       output includes a timestamp relative to the start of the
+       trace. This differs from the output when latency-format
+       is disabled, which includes an absolute timestamp.
 
   delay: This is just to help catch your eye a bit better. And
         needs to be fixed to be only relative to the same CPU.
@@ -440,7 +434,8 @@ Here are the available options:
   sym-addr:
    bash-4000  [01]  1477.606694: simple_strtoul <c0339346>
 
-  verbose - This deals with the latency_trace file.
+  verbose - This deals with the trace file when the
+            latency-format option is enabled.
 
     bash  4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \
     (+0.000ms): simple_strtoul (strict_strtoul)
@@ -472,7 +467,7 @@ Here are the available options:
                the app is no longer running
 
                The lookup is performed when you read
-               trace,trace_pipe,latency_trace. Example:
+               trace,trace_pipe. Example:
 
                a.out-1623  [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
 x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
@@ -481,6 +476,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
               every scheduling event. Will add overhead if
               there's a lot of tasks running at once.
 
+  latency-format - This option changes the trace. When
+                   it is enabled, the trace displays
+                   additional information about the
+                   latencies, as described in "Latency
+                   trace format".
 
 sched_switch
 ------------
@@ -596,12 +596,13 @@ To reset the maximum, echo 0 into tracing_max_latency. Here is
 an example:
 
  # echo irqsoff > current_tracer
+ # echo latency-format > trace_options
  # echo 0 > tracing_max_latency
  # echo 1 > tracing_enabled
  # ls -ltr
  [...]
  # echo 0 > tracing_enabled
- # cat latency_trace
+ # cat trace
 # tracer: irqsoff
 #
 irqsoff latency trace v1.1.5 on 2.6.26
@@ -703,12 +704,13 @@ which preemption was disabled. The control of preemptoff tracer
 is much like the irqsoff tracer.
 
  # echo preemptoff > current_tracer
+ # echo latency-format > trace_options
  # echo 0 > tracing_max_latency
  # echo 1 > tracing_enabled
  # ls -ltr
  [...]
  # echo 0 > tracing_enabled
- # cat latency_trace
+ # cat trace
 # tracer: preemptoff
 #
 preemptoff latency trace v1.1.5 on 2.6.26-rc8
@@ -850,12 +852,13 @@ Again, using this trace is much like the irqsoff and preemptoff
 tracers.
 
  # echo preemptirqsoff > current_tracer
+ # echo latency-format > trace_options
  # echo 0 > tracing_max_latency
  # echo 1 > tracing_enabled
  # ls -ltr
  [...]
  # echo 0 > tracing_enabled
- # cat latency_trace
+ # cat trace
 # tracer: preemptirqsoff
 #
 preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
@@ -1012,11 +1015,12 @@ Instead of performing an 'ls', we will run 'sleep 1' under
 'chrt' which changes the priority of the task.
 
  # echo wakeup > current_tracer
+ # echo latency-format > trace_options
  # echo 0 > tracing_max_latency
  # echo 1 > tracing_enabled
  # chrt -f 5 sleep 1
  # echo 0 > tracing_enabled
- # cat latency_trace
+ # cat trace
 # tracer: wakeup
 #
 wakeup latency trace v1.1.5 on 2.6.26-rc8
diff --git a/Documentation/trace/function-graph-fold.vim b/Documentation/trace/function-graph-fold.vim
new file mode 100644 (file)
index 0000000..0544b50
--- /dev/null
@@ -0,0 +1,42 @@
+" Enable folding for ftrace function_graph traces.
+"
+" To use, :source this file while viewing a function_graph trace, or use vim's
+" -S option to load from the command-line together with a trace.  You can then
+" use the usual vim fold commands, such as "za", to open and close nested
+" functions.  While closed, a fold will show the total time taken for a call,
+" as would normally appear on the line with the closing brace.  Folded
+" functions will not include finish_task_switch(), so folding should remain
+" relatively sane even through a context switch.
+"
+" Note that this will almost certainly only work well with a
+" single-CPU trace (e.g. trace-cmd report --cpu 1).
+
+function! FunctionGraphFoldExpr(lnum)
+  let line = getline(a:lnum)
+  if line[-1:] == '{'
+    if line =~ 'finish_task_switch() {$'
+      return '>1'
+    endif
+    return 'a1'
+  elseif line[-1:] == '}'
+    return 's1'
+  else
+    return '='
+  endif
+endfunction
+
+function! FunctionGraphFoldText()
+  let s = split(getline(v:foldstart), '|', 1)
+  if getline(v:foldend+1) =~ 'finish_task_switch() {$'
+    let s[2] = ' task switch  '
+  else
+    let e = split(getline(v:foldend), '|', 1)
+    let s[2] = e[2]
+  endif
+  return join(s, '|')
+endfunction
+
+setlocal foldexpr=FunctionGraphFoldExpr(v:lnum)
+setlocal foldtext=FunctionGraphFoldText()
+setlocal foldcolumn=12
+setlocal foldmethod=expr
diff --git a/Documentation/trace/ring-buffer-design.txt b/Documentation/trace/ring-buffer-design.txt
new file mode 100644 (file)
index 0000000..5b1d23d
--- /dev/null
@@ -0,0 +1,955 @@
+               Lockless Ring Buffer Design
+               ===========================
+
+Copyright 2009 Red Hat Inc.
+   Author:   Steven Rostedt <srostedt@redhat.com>
+  License:   The GNU Free Documentation License, Version 1.2
+               (dual licensed under the GPL v2)
+Reviewers:   Mathieu Desnoyers, Huang Ying, Hidetoshi Seto,
+            and Frederic Weisbecker.
+
+
+Written for: 2.6.31
+
+Terminology used in this Document
+---------------------------------
+
+tail - where new writes happen in the ring buffer.
+
+head - where new reads happen in the ring buffer.
+
+producer - the task that writes into the ring buffer (same as writer)
+
+writer - same as producer
+
+consumer - the task that reads from the buffer (same as reader)
+
+reader - same as consumer.
+
+reader_page - A page outside the ring buffer used solely (for the most part)
+    by the reader.
+
+head_page - a pointer to the page that the reader will use next
+
+tail_page - a pointer to the page that will be written to next
+
+commit_page - a pointer to the page with the last finished non-nested write.
+
+cmpxchg - hardware assisted atomic transaction that performs the following:
+
+   A = B iff previous A == C
+
+   R = cmpxchg(A, C, B) is saying that we replace A with B if and only if
+      current A is equal to C, and we put the old (current) A into R
+
+   R gets the previous A regardless if A is updated with B or not.
+
+   To see if the update was successful a compare of R == C may be used.
+
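+   In C-like pseudo-code, the behaviour can be sketched as below (only a
+   sketch; the real operation is performed atomically by the hardware):
+
+      long cmpxchg(long *A, long C, long B)
+      {
+         long R = *A;      /* R always receives the previous value */
+
+         if (R == C)
+            *A = B;        /* replace only if the old value matched C */
+         return R;         /* R == C means the update succeeded */
+      }
+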
+The Generic Ring Buffer
+-----------------------
+
+The ring buffer can be used in either an overwrite mode or in
+producer/consumer mode.
+
+Producer/consumer mode is where, if the producer were to fill up the
+buffer before the consumer could free up anything, the producer
+will stop writing to the buffer. This will lose the most recent events.
+
+Overwrite mode is where, if the producer were to fill up the buffer
+before the consumer could free up anything, the producer will
+overwrite the older data. This will lose the oldest events.
+
+No two writers can write at the same time (on the same per-cpu buffer);
+a writer may interrupt another writer, but it must finish writing
+before the previous writer may continue. This is very important to the
+algorithm. The writers act like a "stack". The way interrupts work
+enforces this behavior.
+
+
+  writer1 start
+     <preempted> writer2 start
+         <preempted> writer3 start
+                     writer3 finishes
+                 writer2 finishes
+  writer1 finishes
+
+This is very much like a writer being preempted by an interrupt and
+the interrupt doing a write as well.
+
+Readers can happen at any time, but no two readers may run at the
+same time, nor can a reader preempt/interrupt another reader. A reader
+cannot preempt/interrupt a writer, but it may read/consume from the
+buffer at the same time as a writer is writing, provided the reader
+is on another processor. A reader may read on its own processor
+and can be preempted by a writer.
+
+A writer can preempt a reader, but a reader can not preempt a writer.
+But a reader can read the buffer at the same time (on another processor)
+as a writer.
+
+The ring buffer is made up of a list of pages held together by a linked list.
+
+At initialization a reader page is allocated for the reader that is not
+part of the ring buffer.
+
+The head_page, tail_page and commit_page are all initialized to point
+to the same page.
+
+The reader page is initialized to have its next pointer pointing to
+the head page, and its previous pointer pointing to a page before
+the head page.
+
+The reader has its own page to use. At start up time, this page is
+allocated but is not attached to the list. When the reader wants
+to read from the buffer, if its page is empty (like it is on start up)
+it will swap its page with the head_page. The old reader page will
+become part of the ring buffer and the head_page will be removed.
+The page after the inserted page (old reader_page) will become the
+new head page.
+
+Once the new page is given to the reader, the reader could do what
+it wants with it, as long as a writer has left that page.
+
+A sample of how the reader page is swapped: Note this does not
+show the head page in the buffer, it is for demonstrating a swap
+only.
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |
+  +------+
+                  +---+   +---+   +---+
+                  |   |-->|   |-->|   |
+                  |   |<--|   |<--|   |
+                  +---+   +---+   +---+
+                   ^ |             ^ |
+                   | +-------------+ |
+                   +-----------------+
+
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |-------------------+
+  +------+                   v
+    |             +---+   +---+   +---+
+    |             |   |-->|   |-->|   |
+    |             |   |<--|   |<--|   |<-+
+    |             +---+   +---+   +---+  |
+    |              ^ |             ^ |   |
+    |              | +-------------+ |   |
+    |              +-----------------+   |
+    +------------------------------------+
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |-------------------+
+  +------+ <---------------+ v
+    |  ^          +---+   +---+   +---+
+    |  |          |   |-->|   |-->|   |
+    |  |          |   |   |   |<--|   |<-+
+    |  |          +---+   +---+   +---+  |
+    |  |             |             ^ |   |
+    |  |             +-------------+ |   |
+    |  +-----------------------------+   |
+    +------------------------------------+
+
+  +------+
+  |buffer|          RING BUFFER
+  |page  |-------------------+
+  +------+ <---------------+ v
+    |  ^          +---+   +---+   +---+
+    |  |          |   |   |   |-->|   |
+    |  |  New     |   |   |   |<--|   |<-+
+    |  | Reader   +---+   +---+   +---+  |
+    |  |  page ----^                 |   |
+    |  |                             |   |
+    |  +-----------------------------+   |
+    +------------------------------------+
+
+
+
+It is possible that the page swapped is the commit page and the tail page,
+if what is in the ring buffer is less than what is held in a buffer page.
+
+
+          reader page    commit page   tail page
+              |              |             |
+              v              |             |
+             +---+           |             |
+             |   |<----------+             |
+             |   |<------------------------+
+             |   |------+
+             +---+      |
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+This case is still valid for this algorithm.
+When the writer leaves the page, it simply goes into the ring buffer
+since the reader page still points to the next location in the ring
+buffer.
+
+
+The main pointers:
+
+  reader page - The page used solely by the reader and is not part
+                of the ring buffer (may be swapped in)
+
+  head page - the next page in the ring buffer that will be swapped
+              with the reader page.
+
+  tail page - the page where the next write will take place.
+
+  commit page - the page that last finished a write.
+
+The commit page is only updated by the outermost writer in the
+writer stack. A writer that preempts another writer will not move the
+commit page.
+
+When data is written into the ring buffer, a position is reserved
+in the ring buffer and passed back to the writer. When the writer
+is finished writing data into that position, it commits the write.
+
+Another write (or a read) may take place at any time during this
+transaction. If another write happens, it must finish before continuing
+with the previous write.
+
+
+   Write reserve:
+
+       Buffer page
+      +---------+
+      |written  |
+      +---------+  <--- given back to writer (current commit)
+      |reserved |
+      +---------+ <--- tail pointer
+      | empty   |
+      +---------+
+
+   Write commit:
+
+       Buffer page
+      +---------+
+      |written  |
+      +---------+
+      |written  |
+      +---------+  <--- next position for write (current commit)
+      | empty   |
+      +---------+
+
+
+ If a write happens after the first reserve:
+
+       Buffer page
+      +---------+
+      |written  |
+      +---------+  <-- current commit
+      |reserved |
+      +---------+  <--- given back to second writer
+      |reserved |
+      +---------+ <--- tail pointer
+
+  After second writer commits:
+
+
+       Buffer page
+      +---------+
+      |written  |
+      +---------+  <--(last full commit)
+      |reserved |
+      +---------+
+      |pending  |
+      |commit   |
+      +---------+ <--- tail pointer
+
+  When the first writer commits:
+
+       Buffer page
+      +---------+
+      |written  |
+      +---------+
+      |written  |
+      +---------+
+      |written  |
+      +---------+  <--(last full commit and tail pointer)
+
+
+The commit pointer points to the last write location that was
+committed without preempting another write. When a write that
+preempted another write is committed, it only becomes a pending commit
+and will not be a full commit till all writes have been committed.
+
+The commit page points to the page that has the last full commit.
+The tail page points to the page with the last write (before
+committing).
+
+The tail page is always equal to or after the commit page. It may
+be several pages ahead. If the tail page catches up to the commit
+page then no more writes may take place (regardless of the mode
+of the ring buffer: overwrite or producer/consumer).
+
+The order of pages is:
+
+ head page
+ commit page
+ tail page
+
+Possible scenario:
+                             tail page
+  head page         commit page  |
+      |                 |        |
+      v                 v        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+There is a special case where the head page is after the commit page
+and possibly the tail page. That is when the commit (and tail) page has been
+swapped with the reader page. This is because the head page is always
+part of the ring buffer, but the reader page is not. Whenever less
+than a full page has been committed inside the ring buffer,
+and a reader swaps out a page, it will be swapping out the commit page.
+
+
+          reader page    commit page   tail page
+              |              |             |
+              v              |             |
+             +---+           |             |
+             |   |<----------+             |
+             |   |<------------------------+
+             |   |------+
+             +---+      |
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+                        ^
+                        |
+                    head page
+
+
+In this case, the head page will not move when the tail and commit
+move back into the ring buffer.
+
+The reader can not swap a page into the ring buffer if the commit page
+is still on that page. If the read meets the last commit (real commit
+not pending or reserved), then there is nothing more to read.
+The buffer is considered empty until another full commit finishes.
+
+When the tail meets the head page, if the buffer is in overwrite mode,
+the head page will be pushed ahead one. If the buffer is in producer/consumer
+mode, the write will fail.
+
+Overwrite mode:
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+                        ^
+                        |
+                    head page
+
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+                                 ^
+                                 |
+                             head page
+
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+                                 ^
+                                 |
+                             head page
+
+Note, the reader page will still point to the previous head page.
+But when a swap takes place, it will use the most recent head page.
+
+
+Making the Ring Buffer Lockless:
+--------------------------------
+
+The main idea behind the lockless algorithm is to combine the moving
+of the head_page pointer with the swapping of pages with the reader.
+State flags are placed inside the pointer to the page. To do this,
+each page must be aligned in memory by 4 bytes. This will allow the 2
+least significant bits of the address to be used as flags, since
+they will always be zero for the address. To get the address,
+simply mask out the flags.
+
+  MASK = ~3
+
+  address & MASK
+
+Two flags will be kept by these two bits:
+
+   HEADER - the page being pointed to is a head page
+
+   UPDATE - the page being pointed to is being updated by a writer
+          and was or is about to be a head page.
+
+
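+For illustration, masking the flags out of a pointer value could look
+like the following sketch (the helper names here are made up):
+
+   #define RB_FLAG_MASK  3UL              /* two least significant bits */
+
+   /* recover the page address by stripping the HEADER/UPDATE bits */
+   void *rb_page_ptr(unsigned long val)
+   {
+      return (void *)(val & ~RB_FLAG_MASK);
+   }
+
+   /* extract just the flag bits: 0, HEADER or UPDATE */
+   unsigned long rb_page_flags(unsigned long val)
+   {
+      return val & RB_FLAG_MASK;
+   }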
+          reader page
+              |
+              v
+             +---+
+             |   |------+
+             +---+      |
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-H->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+
+The above pointer "-H->" would have the HEADER flag set. That is,
+the page it points to is the next page to be swapped out by the reader.
+This pointer means the next page is the head page.
+
+When the tail page meets the head pointer, it will use cmpxchg to
+change the pointer to the UPDATE state:
+
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-H->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+"-U->" represents a pointer in the UPDATE state.
+
+Any access to the reader will need to take some sort of lock to serialize
+the readers. But the writers will never take a lock to write to the
+ring buffer. This means we only need to worry about a single reader,
+and writes only preempt in "stack" formation.
+
+When the reader tries to swap the page with the ring buffer, it
+will also use cmpxchg. If the flag bit in the pointer to the
+head page does not have the HEADER flag set, the compare will fail
+and the reader will need to look for the new head page and try again.
+Note, the UPDATE and HEADER flags are never set at the same time.
+
+The reader swaps the reader page as follows:
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |
+  +------+
+                  +---+    +---+    +---+
+                  |   |--->|   |--->|   |
+                  |   |<---|   |<---|   |
+                  +---+    +---+    +---+
+                   ^ |               ^ |
+                   | +---------------+ |
+                   +-----H-------------+
+
+The reader sets the reader page's next pointer, with the HEADER flag set,
+to point to the page after the head page.
+
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |-------H-----------+
+  +------+                   v
+    |             +---+    +---+    +---+
+    |             |   |--->|   |--->|   |
+    |             |   |<---|   |<---|   |<-+
+    |             +---+    +---+    +---+  |
+    |              ^ |               ^ |   |
+    |              | +---------------+ |   |
+    |              +-----H-------------+   |
+    +--------------------------------------+
+
+It does a cmpxchg with the pointer to the previous head page to make it
+point to the reader page. Note that the new pointer does not have the HEADER
+flag set.  This action atomically moves the head page forward.
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |-------H-----------+
+  +------+                   v
+    |  ^          +---+   +---+   +---+
+    |  |          |   |-->|   |-->|   |
+    |  |          |   |<--|   |<--|   |<-+
+    |  |          +---+   +---+   +---+  |
+    |  |             |             ^ |   |
+    |  |             +-------------+ |   |
+    |  +-----------------------------+   |
+    +------------------------------------+
+
+After the new head page is set, the previous pointer of the head page is
+updated to the reader page.
+
+  +------+
+  |reader|          RING BUFFER
+  |page  |-------H-----------+
+  +------+ <---------------+ v
+    |  ^          +---+   +---+   +---+
+    |  |          |   |-->|   |-->|   |
+    |  |          |   |   |   |<--|   |<-+
+    |  |          +---+   +---+   +---+  |
+    |  |             |             ^ |   |
+    |  |             +-------------+ |   |
+    |  +-----------------------------+   |
+    +------------------------------------+
+
+  +------+
+  |buffer|          RING BUFFER
+  |page  |-------H-----------+  <--- New head page
+  +------+ <---------------+ v
+    |  ^          +---+   +---+   +---+
+    |  |          |   |   |   |-->|   |
+    |  |  New     |   |   |   |<--|   |<-+
+    |  | Reader   +---+   +---+   +---+  |
+    |  |  page ----^                 |   |
+    |  |                             |   |
+    |  +-----------------------------+   |
+    +------------------------------------+
+
+Another important point. The page that the reader page points back to
+by its previous pointer (the one that now points to the new head page)
+never points back to the reader page. That is because the reader page is
+not part of the ring buffer. Traversing the ring buffer via the next pointers
+will always stay in the ring buffer. Traversing the ring buffer via the
+prev pointers may not.
+
+Note, the way to determine a reader page is simply by examining the previous
+pointer of the page. If the next pointer of the previous page does not
+point back to the original page, then the original page is a reader page:
+
+
+             +--------+
+             | reader |  next   +----+
+             |  page  |-------->|    |<====== (buffer page)
+             +--------+         +----+
+                 |                | ^
+                 |                v | next
+            prev |              +----+
+                 +------------->|    |
+                                +----+
+
+The way the head page moves forward:
+
+When the tail page meets the head page and the buffer is in overwrite mode
+and more writes take place, the head page must be moved forward before the
+writer may move the tail page. The way this is done is that the writer
+performs a cmpxchg to convert the head page pointer from the HEADER
+flag to the UPDATE flag. Once this is done, the reader will
+not be able to swap the head page from the buffer, nor will it be able to
+move the head page, until the writer is finished with the move.
+
+This eliminates any races that the reader can have on the writer. The reader
+must spin, and this is why the reader can not preempt the writer.
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-H->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The following page will be made into the new head page.
+
+           tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+After the new head page has been set, we can set the old head page
+pointer back to NORMAL.
+
+           tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+After the head page has been moved, the tail page may now move forward.
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+
+The above are the trivial updates. Now for the more complex scenarios.
+
+
+As stated before, if enough writes preempt the first write, the
+tail page may make it all the way around the buffer and meet the commit
+page. At this time, we must start dropping writes (usually with some kind
+of warning to the user). But what happens if the commit page was still on
+the reader page? The commit page is not part of the ring buffer, and the
+tail page must account for this.
+
+
+          reader page    commit page
+              |              |
+              v              |
+             +---+           |
+             |   |<----------+
+             |   |
+             |   |------+
+             +---+      |
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-H->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+               ^
+               |
+           tail page
+
+If the tail page were to simply push the head page forward, the commit,
+when it leaves the reader page, would not be pointing to the correct page.
+
+The solution to this is to test if the commit page is on the reader page
+before pushing the head page. If it is, then it can be assumed that the
+tail page wrapped the buffer, and we must drop new writes.
+
+This is not a race condition, because the commit page can only be moved
+by the outermost writer (the writer that was preempted).
+This means that the commit will not move while a writer is moving the
+tail page. The reader cannot swap the reader page if it is also being
+used as the commit page; the reader can simply check that the commit
+is off the reader page. Once the commit page leaves the reader page
+it will never go back onto it unless a reader does another swap with the
+buffer page that is also the commit page.
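+
+A short sketch of that test (field names are illustrative):
+
+  /* Before pushing the head page forward, check whether the commit
+   * page is still on the reader page.  If it is, the tail page has
+   * wrapped the whole buffer and this write must be dropped. */
+  if (cpu_buffer->commit_page == cpu_buffer->reader_page)
+          goto drop_write;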
+
+
+Nested writes
+-------------
+
+When pushing the tail page forward, we must first push the head page
+forward if the head page is the next page. If the head page is not the
+next page, the tail page is simply updated with a cmpxchg.
+
+Only writers move the tail page. This must be done atomically to protect
+against nested writers.
+
+  temp_page = tail_page
+  next_page = temp_page->next
+  cmpxchg(tail_page, temp_page, next_page)
+
+The above will update the tail page if it is still pointing to the expected
+page. If this fails, a nested write pushed it forward, and the current write
+does not need to push it.
+
+
+           temp page
+               |
+               v
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+A nested write comes in and moves the tail page forward:
+
+                    tail page (moved by nested writer)
+            temp page   |
+               |        |
+               v        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The above would fail the cmpxchg, but since the tail page has already
+been moved forward, the writer will just try again to reserve storage
+on the new tail page.
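+
+Put together as a rough C sketch (the helper name is illustrative), the
+advance-and-retry logic looks roughly like this:
+
+  for (;;) {
+          struct buffer_page *page = tail_page;
+
+          if (try_to_reserve(page))
+                  break;  /* got space on the current tail page */
+
+          /* The page is full: advance the tail.  If the cmpxchg fails,
+           * a nested writer already advanced it, which is just as good.
+           * Either way, retry the reservation on the new tail page. */
+          cmpxchg(&tail_page, page, page->next);
+  }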
+
+But the moving of the head page is a bit more complex.
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-H->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The writer converts the head page pointer to UPDATE.
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+But a nested writer may preempt here. It will see that the next
+page is a head page, and it will also detect that it is nested and
+save that information. The detection comes from the fact that it
+sees the UPDATE flag instead of a HEADER or NORMAL pointer.
+
+The nested writer will set the new head page pointer.
+
+           tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+But it will not reset the UPDATE flag back to NORMAL. Only the writer
+that converted a pointer from HEAD to UPDATE will convert it back
+to NORMAL.
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+After the nested writer finishes, the outermost writer will convert
+the UPDATE pointer to NORMAL.
+
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+
+It can be even more complex if several nested writes come in and move
+the tail page ahead several pages:
+
+
+(first writer)
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-H->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The writer converts the head page pointer to UPDATE.
+
+            tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The next writer comes in, sees the UPDATE flag, and sets up the new
+head page.
+
+(second writer)
+
+           tail page
+               |
+               v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The nested writer moves the tail page forward, but does not set the old
+UPDATE page to NORMAL because it is not the outermost writer.
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+Another writer preempts and sees that the page after the tail page is a head page.
+It changes it from HEAD to UPDATE.
+
+(third writer)
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-U->|   |--->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The writer will move the head page forward:
+
+
+(third writer)
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-U->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+Since the third writer is the one that changed the HEAD flag to UPDATE,
+it will convert it back to NORMAL:
+
+
+(third writer)
+
+                    tail page
+                        |
+                        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+
+Then it will move the tail page and return to the second writer.
+
+
+(second writer)
+
+                             tail page
+                                 |
+                                 v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+
+The second writer will fail to move the tail page because it was already
+moved, so it will try again and add its data to the new tail page.
+It will return to the first writer.
+
+
+(first writer)
+
+                             tail page
+                                 |
+                                 v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+The first writer cannot atomically test whether the tail page moved
+while it updates the HEAD page. It will simply update the head page to
+what it thinks is the new head page.
+
+
+(first writer)
+
+                             tail page
+                                 |
+                                 v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+Since the cmpxchg returns the old value of the pointer, the first writer
+will see that it succeeded in updating the pointer from NORMAL to HEAD.
+But as we can see, this is not good enough. It must also check whether
+the tail page is either where it used to be or on the next page:
+
+
+(first writer)
+
+               A        B    tail page
+               |        |        |
+               v        v        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |-H->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+If the tail page is neither A nor B, then the writer must reset the
+pointer back to NORMAL. Because it only needs to worry about nested
+writers, it only needs to perform this check after setting the HEAD page.
+
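+A rough sketch of that check (A and B are the pages labeled in the
+diagrams, and the helper name is illustrative):
+
+  /* The NORMAL -> HEAD cmpxchg succeeded, but nested writers may have
+   * carried the tail page past both positions this writer can account
+   * for.  If so, the HEAD flag it just set is stale and must be taken
+   * back off. */
+  if (tail_page != page_A && tail_page != page_B)
+          pointer_set_normal(&page_B->next);   /* undo the stale HEAD */
+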
+
+(first writer)
+
+               A        B    tail page
+               |        |        |
+               v        v        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |-U->|   |--->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
+Now the writer can update the head page. This is also why the head page must
+remain in UPDATE and only be reset by the outermost writer. This prevents
+the reader from seeing an incorrect head page.
+
+
+(first writer)
+
+               A        B    tail page
+               |        |        |
+               v        v        v
+    +---+    +---+    +---+    +---+
+<---|   |--->|   |--->|   |--->|   |-H->
+--->|   |<---|   |<---|   |<---|   |<---
+    +---+    +---+    +---+    +---+
+
index 99193b1..beea3cc 100644 (file)
@@ -30,6 +30,18 @@ config OPROFILE_IBS
 
          If unsure, say N.
 
+config OPROFILE_EVENT_MULTIPLEX
+       bool "OProfile multiplexing support (EXPERIMENTAL)"
+       default n
+       depends on OPROFILE && X86
+       help
+         The number of hardware counters is limited. The multiplexing
+         feature enables OProfile to gather more events than the hardware
+         provides counters for. This is realized by switching between
+         events at a user-specified time interval.
+
+         If unsure, say N.
+
 config HAVE_OPROFILE
        bool
 
index 5a61b5c..8d3c79c 100644 (file)
@@ -44,7 +44,6 @@ static inline void dma_free_coherent(struct device *dev, size_t size,
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 
 #define get_dma_ops(dev) platform_dma_get_ops(dev)
-#define flush_write_buffers()
 
 #include <asm-generic/dma-mapping-common.h>
 
@@ -69,6 +68,24 @@ dma_set_mask (struct device *dev, u64 mask)
        return 0;
 }
 
+static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+{
+       if (!dev->dma_mask)
+               return 0;
+
+       return addr + size <= *dev->dma_mask;
+}
+
+static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+       return paddr;
+}
+
+static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
+{
+       return daddr;
+}
+
 extern int dma_get_cache_alignment(void);
 
 static inline void
index fb83326..dbeadb9 100644 (file)
@@ -133,8 +133,7 @@ consider_steal_time(unsigned long new_itm)
                account_idle_ticks(blocked);
                run_local_timers();
 
-               if (rcu_pending(cpu))
-                       rcu_check_callbacks(cpu, user_mode(get_irq_regs()));
+               rcu_check_callbacks(cpu, user_mode(get_irq_regs()));
 
                scheduler_tick();
                run_posix_cpu_timers(p);
index 5202f5a..4741258 100644 (file)
@@ -46,7 +46,6 @@
 #define curptr a2
 
 LFLUSH_I_AND_D = 0x00000808
-LSIGTRAP = 5
 
 /* process bits for task_struct.ptrace */
 PT_TRACESYS_OFF = 3
@@ -118,9 +117,6 @@ PT_DTRACE_BIT = 2
 #define STR(X) STR1(X)
 #define STR1(X) #X
 
-#define PT_OFF_ORIG_D0  0x24
-#define PT_OFF_FORMATVEC 0x32
-#define PT_OFF_SR       0x2C
 #define SAVE_ALL_INT                           \
        "clrl   %%sp@-;"    /* stk_adj */       \
        "pea    -1:w;"      /* orig d0 = -1 */  \
index c2553d2..907ed03 100644 (file)
@@ -72,8 +72,8 @@ LENOSYS = 38
        lea     %sp@(-32),%sp           /* space for 8 regs */
        moveml  %d1-%d5/%a0-%a2,%sp@
        movel   sw_usp,%a0              /* get usp */
-       movel   %a0@-,%sp@(PT_PC)       /* copy exception program counter */
-       movel   %a0@-,%sp@(PT_FORMATVEC)/* copy exception format/vector/sr */
+       movel   %a0@-,%sp@(PT_OFF_PC)   /* copy exception program counter */
+       movel   %a0@-,%sp@(PT_OFF_FORMATVEC)/*copy exception format/vector/sr */
        bra     7f
        6:
        clrl    %sp@-                   /* stkadj */
@@ -89,8 +89,8 @@ LENOSYS = 38
        bnes    8f                      /* no, skip */
        move    #0x2700,%sr             /* disable intrs */
        movel   sw_usp,%a0              /* get usp */
-       movel   %sp@(PT_PC),%a0@-       /* copy exception program counter */
-       movel   %sp@(PT_FORMATVEC),%a0@-/* copy exception format/vector/sr */
+       movel   %sp@(PT_OFF_PC),%a0@-   /* copy exception program counter */
+       movel   %sp@(PT_OFF_FORMATVEC),%a0@-/*copy exception format/vector/sr */
        moveml  %sp@,%d1-%d5/%a0-%a2
        lea     %sp@(32),%sp            /* space for 8 regs */
        movel   %sp@+,%d0
index ddfab96..5e9249b 100644 (file)
@@ -145,16 +145,16 @@ extern unsigned int fp_debugprint;
  * these are only used during instruction decoding
  * where we always know how deep we're on the stack.
  */
-#define FPS_DO         (PT_D0)
-#define FPS_D1         (PT_D1)
-#define FPS_D2         (PT_D2)
-#define FPS_A0         (PT_A0)
-#define FPS_A1         (PT_A1)
-#define FPS_A2         (PT_A2)
-#define FPS_SR         (PT_SR)
-#define FPS_PC         (PT_PC)
-#define FPS_EA         (PT_PC+6)
-#define FPS_PC2                (PT_PC+10)
+#define FPS_DO         (PT_OFF_D0)
+#define FPS_D1         (PT_OFF_D1)
+#define FPS_D2         (PT_OFF_D2)
+#define FPS_A0         (PT_OFF_A0)
+#define FPS_A1         (PT_OFF_A1)
+#define FPS_A2         (PT_OFF_A2)
+#define FPS_SR         (PT_OFF_SR)
+#define FPS_PC         (PT_OFF_PC)
+#define FPS_EA         (PT_OFF_PC+6)
+#define FPS_PC2                (PT_OFF_PC+10)
 
 .macro fp_get_fp_reg
        lea     (FPD_FPREG,FPDATA,%d0.w*4),%a0
index 6ea5c33..b6da388 100644 (file)
@@ -1,6 +1,10 @@
 #ifndef _ASM_M68K_THREAD_INFO_H
 #define _ASM_M68K_THREAD_INFO_H
 
+#ifndef ASM_OFFSETS_C
+#include <asm/asm-offsets.h>
+#endif
+#include <asm/current.h>
 #include <asm/types.h>
 #include <asm/page.h>
 
@@ -31,7 +35,12 @@ struct thread_info {
 #define init_thread_info       (init_task.thread.info)
 #define init_stack             (init_thread_union.stack)
 
-#define task_thread_info(tsk)  (&(tsk)->thread.info)
+#ifdef ASM_OFFSETS_C
+#define task_thread_info(tsk)  ((struct thread_info *) NULL)
+#else
+#define task_thread_info(tsk)  ((struct thread_info *)((char *)tsk+TASK_TINFO))
+#endif
+
 #define task_stack_page(tsk)   ((tsk)->stack)
 #define current_thread_info()  task_thread_info(current)
 
index b1f012f..73e5e58 100644 (file)
@@ -8,6 +8,8 @@
  * #defines from the assembly-language output.
  */
 
+#define ASM_OFFSETS_C
+
 #include <linux/stddef.h>
 #include <linux/sched.h>
 #include <linux/kernel_stat.h>
@@ -27,6 +29,9 @@ int main(void)
        DEFINE(TASK_INFO, offsetof(struct task_struct, thread.info));
        DEFINE(TASK_MM, offsetof(struct task_struct, mm));
        DEFINE(TASK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
+#ifdef CONFIG_MMU
+       DEFINE(TASK_TINFO, offsetof(struct task_struct, thread.info));
+#endif
 
        /* offsets into the thread struct */
        DEFINE(THREAD_KSP, offsetof(struct thread_struct, ksp));
@@ -44,20 +49,20 @@ int main(void)
        DEFINE(TINFO_FLAGS, offsetof(struct thread_info, flags));
 
        /* offsets into the pt_regs */
-       DEFINE(PT_D0, offsetof(struct pt_regs, d0));
-       DEFINE(PT_ORIG_D0, offsetof(struct pt_regs, orig_d0));
-       DEFINE(PT_D1, offsetof(struct pt_regs, d1));
-       DEFINE(PT_D2, offsetof(struct pt_regs, d2));
-       DEFINE(PT_D3, offsetof(struct pt_regs, d3));
-       DEFINE(PT_D4, offsetof(struct pt_regs, d4));
-       DEFINE(PT_D5, offsetof(struct pt_regs, d5));
-       DEFINE(PT_A0, offsetof(struct pt_regs, a0));
-       DEFINE(PT_A1, offsetof(struct pt_regs, a1));
-       DEFINE(PT_A2, offsetof(struct pt_regs, a2));
-       DEFINE(PT_PC, offsetof(struct pt_regs, pc));
-       DEFINE(PT_SR, offsetof(struct pt_regs, sr));
+       DEFINE(PT_OFF_D0, offsetof(struct pt_regs, d0));
+       DEFINE(PT_OFF_ORIG_D0, offsetof(struct pt_regs, orig_d0));
+       DEFINE(PT_OFF_D1, offsetof(struct pt_regs, d1));
+       DEFINE(PT_OFF_D2, offsetof(struct pt_regs, d2));
+       DEFINE(PT_OFF_D3, offsetof(struct pt_regs, d3));
+       DEFINE(PT_OFF_D4, offsetof(struct pt_regs, d4));
+       DEFINE(PT_OFF_D5, offsetof(struct pt_regs, d5));
+       DEFINE(PT_OFF_A0, offsetof(struct pt_regs, a0));
+       DEFINE(PT_OFF_A1, offsetof(struct pt_regs, a1));
+       DEFINE(PT_OFF_A2, offsetof(struct pt_regs, a2));
+       DEFINE(PT_OFF_PC, offsetof(struct pt_regs, pc));
+       DEFINE(PT_OFF_SR, offsetof(struct pt_regs, sr));
        /* bitfields are a bit difficult */
-       DEFINE(PT_VECTOR, offsetof(struct pt_regs, pc) + 4);
+       DEFINE(PT_OFF_FORMATVEC, offsetof(struct pt_regs, pc) + 4);
 
        /* offsets into the irq_handler struct */
        DEFINE(IRQ_HANDLER, offsetof(struct irq_node, handler));
@@ -84,10 +89,10 @@ int main(void)
        DEFINE(FONT_DESC_PREF, offsetof(struct font_desc, pref));
 
        /* signal defines */
-       DEFINE(SIGSEGV, SIGSEGV);
-       DEFINE(SEGV_MAPERR, SEGV_MAPERR);
-       DEFINE(SIGTRAP, SIGTRAP);
-       DEFINE(TRAP_TRACE, TRAP_TRACE);
+       DEFINE(LSIGSEGV, SIGSEGV);
+       DEFINE(LSEGV_MAPERR, SEGV_MAPERR);
+       DEFINE(LSIGTRAP, SIGTRAP);
+       DEFINE(LTRAP_TRACE, TRAP_TRACE);
 
        /* offsets into the custom struct */
        DEFINE(CUSTOMBASE, &amiga_custom);
index c3735cd..922f52e 100644 (file)
@@ -77,17 +77,17 @@ ENTRY(ret_from_fork)
        jra     .Lret_from_exception
 
 do_trace_entry:
-       movel   #-ENOSYS,%sp@(PT_D0)    | needed for strace
+       movel   #-ENOSYS,%sp@(PT_OFF_D0)| needed for strace
        subql   #4,%sp
        SAVE_SWITCH_STACK
        jbsr    syscall_trace
        RESTORE_SWITCH_STACK
        addql   #4,%sp
-       movel   %sp@(PT_ORIG_D0),%d0
+       movel   %sp@(PT_OFF_ORIG_D0),%d0
        cmpl    #NR_syscalls,%d0
        jcs     syscall
 badsys:
-       movel   #-ENOSYS,%sp@(PT_D0)
+       movel   #-ENOSYS,%sp@(PT_OFF_D0)
        jra     ret_from_syscall
 
 do_trace_exit:
@@ -103,7 +103,7 @@ ENTRY(ret_from_signal)
        addql   #4,%sp
 /* on 68040 complete pending writebacks if any */
 #ifdef CONFIG_M68040
-       bfextu  %sp@(PT_VECTOR){#0,#4},%d0
+       bfextu  %sp@(PT_OFF_FORMATVEC){#0,#4},%d0
        subql   #7,%d0                          | bus error frame ?
        jbne    1f
        movel   %sp,%sp@-
@@ -127,7 +127,7 @@ ENTRY(system_call)
        jcc     badsys
 syscall:
        jbsr    @(sys_call_table,%d0:l:4)@(0)
-       movel   %d0,%sp@(PT_D0)         | save the return value
+       movel   %d0,%sp@(PT_OFF_D0)     | save the return value
 ret_from_syscall:
        |oriw   #0x0700,%sr
        movew   %curptr@(TASK_INFO+TINFO_FLAGS+2),%d0
@@ -135,7 +135,7 @@ ret_from_syscall:
 1:     RESTORE_ALL
 
 syscall_exit_work:
-       btst    #5,%sp@(PT_SR)          | check if returning to kernel
+       btst    #5,%sp@(PT_OFF_SR)      | check if returning to kernel
        bnes    1b                      | if so, skip resched, signals
        lslw    #1,%d0
        jcs     do_trace_exit
@@ -148,7 +148,7 @@ syscall_exit_work:
 
 ENTRY(ret_from_exception)
 .Lret_from_exception:
-       btst    #5,%sp@(PT_SR)          | check if returning to kernel
+       btst    #5,%sp@(PT_OFF_SR)      | check if returning to kernel
        bnes    1f                      | if so, skip resched, signals
        | only allow interrupts when we are really the last one on the
        | kernel stack, otherwise stack overflow can occur during
@@ -182,7 +182,7 @@ do_signal_return:
        jbra    resume_userspace
 
 do_delayed_trace:
-       bclr    #7,%sp@(PT_SR)          | clear trace bit in SR
+       bclr    #7,%sp@(PT_OFF_SR)      | clear trace bit in SR
        pea     1                       | send SIGTRAP
        movel   %curptr,%sp@-
        pea     LSIGTRAP
@@ -199,7 +199,7 @@ ENTRY(auto_inthandler)
        GET_CURRENT(%d0)
        addqb   #1,%curptr@(TASK_INFO+TINFO_PREEMPT+1)
                                        |  put exception # in d0
-       bfextu  %sp@(PT_VECTOR){#4,#10},%d0
+       bfextu  %sp@(PT_OFF_FORMATVEC){#4,#10},%d0
        subw    #VEC_SPUR,%d0
 
        movel   %sp,%sp@-
@@ -216,7 +216,7 @@ ret_from_interrupt:
        ALIGN
 ret_from_last_interrupt:
        moveq   #(~ALLOWINT>>8)&0xff,%d0
-       andb    %sp@(PT_SR),%d0
+       andb    %sp@(PT_OFF_SR),%d0
        jne     2b
 
        /* check if we need to do software interrupts */
@@ -232,7 +232,7 @@ ENTRY(user_inthandler)
        GET_CURRENT(%d0)
        addqb   #1,%curptr@(TASK_INFO+TINFO_PREEMPT+1)
                                        |  put exception # in d0
-       bfextu  %sp@(PT_VECTOR){#4,#10},%d0
+       bfextu  %sp@(PT_OFF_FORMATVEC){#4,#10},%d0
 user_irqvec_fixup = . + 2
        subw    #VEC_USER,%d0
 
index 954b4f3..a3fe1f3 100644 (file)
@@ -85,8 +85,8 @@ fp_err_ua2:
 fp_err_ua1:
        addq.l  #4,%sp
        move.l  %a0,-(%sp)
-       pea     SEGV_MAPERR
-       pea     SIGSEGV
+       pea     LSEGV_MAPERR
+       pea     LSIGSEGV
        jsr     fpemu_signal
        add.w   #12,%sp
        jra     ret_from_exception
@@ -96,8 +96,8 @@ fp_err_ua1:
        | it does not really belong here, but...
 fp_sendtrace060:
        move.l  (FPS_PC,%sp),-(%sp)
-       pea     TRAP_TRACE
-       pea     SIGTRAP
+       pea     LTRAP_TRACE
+       pea     LSIGTRAP
        jsr     fpemu_signal
        add.w   #12,%sp
        jra     ret_from_exception
@@ -122,17 +122,17 @@ fp_get_data_reg:
        .long   fp_get_d6, fp_get_d7
 
 fp_get_d0:
-       move.l  (PT_D0+8,%sp),%d0
+       move.l  (PT_OFF_D0+8,%sp),%d0
        printf  PREGISTER,"{d0->%08x}",1,%d0
        rts
 
 fp_get_d1:
-       move.l  (PT_D1+8,%sp),%d0
+       move.l  (PT_OFF_D1+8,%sp),%d0
        printf  PREGISTER,"{d1->%08x}",1,%d0
        rts
 
 fp_get_d2:
-       move.l  (PT_D2+8,%sp),%d0
+       move.l  (PT_OFF_D2+8,%sp),%d0
        printf  PREGISTER,"{d2->%08x}",1,%d0
        rts
 
@@ -173,35 +173,35 @@ fp_put_data_reg:
 
 fp_put_d0:
        printf  PREGISTER,"{d0<-%08x}",1,%d0
-       move.l  %d0,(PT_D0+8,%sp)
+       move.l  %d0,(PT_OFF_D0+8,%sp)
        rts
 
 fp_put_d1:
        printf  PREGISTER,"{d1<-%08x}",1,%d0
-       move.l  %d0,(PT_D1+8,%sp)
+       move.l  %d0,(PT_OFF_D1+8,%sp)
        rts
 
 fp_put_d2:
        printf  PREGISTER,"{d2<-%08x}",1,%d0
-       move.l  %d0,(PT_D2+8,%sp)
+       move.l  %d0,(PT_OFF_D2+8,%sp)
        rts
 
 fp_put_d3:
        printf  PREGISTER,"{d3<-%08x}",1,%d0
 |      move.l  %d0,%d3
-       move.l  %d0,(PT_D3+8,%sp)
+       move.l  %d0,(PT_OFF_D3+8,%sp)
        rts
 
 fp_put_d4:
        printf  PREGISTER,"{d4<-%08x}",1,%d0
 |      move.l  %d0,%d4
-       move.l  %d0,(PT_D4+8,%sp)
+       move.l  %d0,(PT_OFF_D4+8,%sp)
        rts
 
 fp_put_d5:
        printf  PREGISTER,"{d5<-%08x}",1,%d0
 |      move.l  %d0,%d5
-       move.l  %d0,(PT_D5+8,%sp)
+       move.l  %d0,(PT_OFF_D5+8,%sp)
        rts
 
 fp_put_d6:
@@ -225,17 +225,17 @@ fp_get_addr_reg:
        .long   fp_get_a6, fp_get_a7
 
 fp_get_a0:
-       move.l  (PT_A0+8,%sp),%a0
+       move.l  (PT_OFF_A0+8,%sp),%a0
        printf  PREGISTER,"{a0->%08x}",1,%a0
        rts
 
 fp_get_a1:
-       move.l  (PT_A1+8,%sp),%a0
+       move.l  (PT_OFF_A1+8,%sp),%a0
        printf  PREGISTER,"{a1->%08x}",1,%a0
        rts
 
 fp_get_a2:
-       move.l  (PT_A2+8,%sp),%a0
+       move.l  (PT_OFF_A2+8,%sp),%a0
        printf  PREGISTER,"{a2->%08x}",1,%a0
        rts
 
@@ -276,17 +276,17 @@ fp_put_addr_reg:
 
 fp_put_a0:
        printf  PREGISTER,"{a0<-%08x}",1,%a0
-       move.l  %a0,(PT_A0+8,%sp)
+       move.l  %a0,(PT_OFF_A0+8,%sp)
        rts
 
 fp_put_a1:
        printf  PREGISTER,"{a1<-%08x}",1,%a0
-       move.l  %a0,(PT_A1+8,%sp)
+       move.l  %a0,(PT_OFF_A1+8,%sp)
        rts
 
 fp_put_a2:
        printf  PREGISTER,"{a2<-%08x}",1,%a0
-       move.l  %a0,(PT_A2+8,%sp)
+       move.l  %a0,(PT_OFF_A2+8,%sp)
        rts
 
 fp_put_a3:
index b44aaab..0c34371 100644 (file)
@@ -424,6 +424,29 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 #endif
 }
 
+static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+{
+       struct dma_mapping_ops *ops = get_dma_ops(dev);
+
+       if (ops->addr_needs_map && ops->addr_needs_map(dev, addr, size))
+               return 0;
+
+       if (!dev->dma_mask)
+               return 0;
+
+       return addr + size <= *dev->dma_mask;
+}
+
+static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+       return paddr + get_dma_direct_offset(dev);
+}
+
+static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
+{
+       return daddr - get_dma_direct_offset(dev);
+}
+
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 #ifdef CONFIG_NOT_COHERENT_CACHE
index eb17da7..2a5da06 100644 (file)
@@ -104,8 +104,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
        else
                pte_update(ptep, ~_PAGE_HASHPTE, pte_val(pte));
 
-#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) && defined(CONFIG_SMP)
-       /* Second case is 32-bit with 64-bit PTE in SMP mode. In this case, we
+#elif defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT)
+       /* Second case is 32-bit with 64-bit PTE.  In this case, we
         * can just store as long as we do the two halves in the right order
         * with a barrier in between. This is possible because we take care,
         * in the hash code, to pre-invalidate if the PTE was already hashed,
@@ -140,7 +140,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 
 #else
        /* Anything else just stores the PTE normally. That covers all 64-bit
-        * cases, and 32-bit non-hash with 64-bit PTEs in UP mode
+        * cases, and 32-bit non-hash with 32-bit PTEs.
         */
        *ptep = pte;
 #endif
index c3b1931..198266c 100644 (file)
@@ -54,7 +54,7 @@
  * This returns the old value in the lock, so we succeeded
  * in getting the lock if the return value is 0.
  */
-static inline unsigned long __spin_trylock(raw_spinlock_t *lock)
+static inline unsigned long arch_spin_trylock(raw_spinlock_t *lock)
 {
        unsigned long tmp, token;
 
@@ -76,7 +76,7 @@ static inline unsigned long __spin_trylock(raw_spinlock_t *lock)
 static inline int __raw_spin_trylock(raw_spinlock_t *lock)
 {
        CLEAR_IO_SYNC;
-       return __spin_trylock(lock) == 0;
+       return arch_spin_trylock(lock) == 0;
 }
 
 /*
@@ -108,7 +108,7 @@ static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
        CLEAR_IO_SYNC;
        while (1) {
-               if (likely(__spin_trylock(lock) == 0))
+               if (likely(arch_spin_trylock(lock) == 0))
                        break;
                do {
                        HMT_low();
@@ -126,7 +126,7 @@ void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
 
        CLEAR_IO_SYNC;
        while (1) {
-               if (likely(__spin_trylock(lock) == 0))
+               if (likely(arch_spin_trylock(lock) == 0))
                        break;
                local_save_flags(flags_dis);
                local_irq_restore(flags);
@@ -181,7 +181,7 @@ extern void __raw_spin_unlock_wait(raw_spinlock_t *lock);
  * This returns the old value in the lock + 1,
  * so we got a read lock if the return value is > 0.
  */
-static inline long __read_trylock(raw_rwlock_t *rw)
+static inline long arch_read_trylock(raw_rwlock_t *rw)
 {
        long tmp;
 
@@ -205,7 +205,7 @@ static inline long __read_trylock(raw_rwlock_t *rw)
  * This returns the old value in the lock,
  * so we got the write lock if the return value is 0.
  */
-static inline long __write_trylock(raw_rwlock_t *rw)
+static inline long arch_write_trylock(raw_rwlock_t *rw)
 {
        long tmp, token;
 
@@ -228,7 +228,7 @@ static inline long __write_trylock(raw_rwlock_t *rw)
 static inline void __raw_read_lock(raw_rwlock_t *rw)
 {
        while (1) {
-               if (likely(__read_trylock(rw) > 0))
+               if (likely(arch_read_trylock(rw) > 0))
                        break;
                do {
                        HMT_low();
@@ -242,7 +242,7 @@ static inline void __raw_read_lock(raw_rwlock_t *rw)
 static inline void __raw_write_lock(raw_rwlock_t *rw)
 {
        while (1) {
-               if (likely(__write_trylock(rw) == 0))
+               if (likely(arch_write_trylock(rw) == 0))
                        break;
                do {
                        HMT_low();
@@ -255,12 +255,12 @@ static inline void __raw_write_lock(raw_rwlock_t *rw)
 
 static inline int __raw_read_trylock(raw_rwlock_t *rw)
 {
-       return __read_trylock(rw) > 0;
+       return arch_read_trylock(rw) > 0;
 }
 
 static inline int __raw_write_trylock(raw_rwlock_t *rw)
 {
-       return __write_trylock(rw) == 0;
+       return arch_write_trylock(rw) == 0;
 }
 
 static inline void __raw_read_unlock(raw_rwlock_t *rw)
index b73396b..9619285 100644 (file)
@@ -97,7 +97,7 @@ obj64-$(CONFIG_AUDIT)         += compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)   += ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER)    += ftrace.o
-obj-$(CONFIG_PPC_PERF_CTRS)    += perf_counter.o
+obj-$(CONFIG_PPC_PERF_CTRS)    += perf_counter.o perf_callchain.o
 obj64-$(CONFIG_PPC_PERF_CTRS)  += power4-pmu.o ppc970-pmu.o power5-pmu.o \
                                   power5+-pmu.o power6-pmu.o power7-pmu.o
 obj32-$(CONFIG_PPC_PERF_CTRS)  += mpc7450-pmu.o
index 561b646..197b156 100644 (file)
@@ -67,6 +67,8 @@ int main(void)
        DEFINE(MMCONTEXTID, offsetof(struct mm_struct, context.id));
 #ifdef CONFIG_PPC64
        DEFINE(AUDITCONTEXT, offsetof(struct task_struct, audit_context));
+       DEFINE(SIGSEGV, SIGSEGV);
+       DEFINE(NMI_MASK, NMI_MASK);
 #else
        DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));
 #endif /* CONFIG_PPC64 */
index 68ccf11..e8a57de 100644 (file)
 int swiotlb __read_mostly;
 unsigned int ppc_swiotlb_enable;
 
-void *swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t addr)
-{
-       unsigned long pfn = PFN_DOWN(swiotlb_bus_to_phys(hwdev, addr));
-       void *pageaddr = page_address(pfn_to_page(pfn));
-
-       if (pageaddr != NULL)
-               return pageaddr + (addr % PAGE_SIZE);
-       return NULL;
-}
-
-dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
-{
-       return paddr + get_dma_direct_offset(hwdev);
-}
-
-phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
-
-{
-       return baddr - get_dma_direct_offset(hwdev);
-}
-
-/*
- * Determine if an address needs bounce buffering via swiotlb.
- * Going forward I expect the swiotlb code to generalize on using
- * a dma_ops->addr_needs_map, and this function will move from here to the
- * generic swiotlb code.
- */
-int
-swiotlb_arch_address_needs_mapping(struct device *hwdev, dma_addr_t addr,
-                                  size_t size)
-{
-       struct dma_mapping_ops *dma_ops = get_dma_ops(hwdev);
-
-       BUG_ON(!dma_ops);
-       return dma_ops->addr_needs_map(hwdev, addr, size);
-}
-
 /*
  * Determine if an address is reachable by a pci device, or if we must bounce.
  */
 static int
 swiotlb_pci_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size)
 {
-       u64 mask = dma_get_mask(hwdev);
        dma_addr_t max;
        struct pci_controller *hose;
        struct pci_dev *pdev = to_pci_dev(hwdev);
@@ -79,16 +41,9 @@ swiotlb_pci_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size)
        if ((addr + size > max) | (addr < hose->dma_window_base_cur))
                return 1;
 
-       return !is_buffer_dma_capable(mask, addr, size);
-}
-
-static int
-swiotlb_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size)
-{
-       return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
+       return 0;
 }
 
-
 /*
  * At the moment, all platforms that use this code only require
  * swiotlb to be used if we're operating on HIGHMEM.  Since
@@ -104,7 +59,6 @@ struct dma_mapping_ops swiotlb_dma_ops = {
        .dma_supported = swiotlb_dma_supported,
        .map_page = swiotlb_map_page,
        .unmap_page = swiotlb_unmap_page,
-       .addr_needs_map = swiotlb_addr_needs_map,
        .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
        .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
        .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
index eb89811..8ac85e0 100644 (file)
@@ -729,6 +729,11 @@ BEGIN_FTR_SECTION
        bne-    do_ste_alloc            /* If so handle it */
 END_FTR_SECTION_IFCLR(CPU_FTR_SLB)
 
+       clrrdi  r11,r1,THREAD_SHIFT
+       lwz     r0,TI_PREEMPT(r11)      /* If we're in an "NMI" */
+       andis.  r0,r0,NMI_MASK@h        /* (i.e. an irq when soft-disabled) */
+       bne     77f                     /* then don't call hash_page now */
+
        /*
         * On iSeries, we soft-disable interrupts here, then
         * hard-enable interrupts so that the hash_page code can spin on
@@ -833,6 +838,20 @@ handle_page_fault:
        bl      .low_hash_fault
        b       .ret_from_except
 
+/*
+ * We come here as a result of a DSI at a point where we don't want
+ * to call hash_page, such as when we are accessing memory (possibly
+ * user memory) inside a PMU interrupt that occurred while interrupts
+ * were soft-disabled.  We want to invoke the exception handler for
+ * the access, or panic if there isn't a handler.
+ */
+77:    bl      .save_nvgprs
+       mr      r4,r3
+       addi    r3,r1,STACK_FRAME_OVERHEAD
+       li      r5,SIGSEGV
+       bl      .bad_page_fault
+       b       .ret_from_except
+
        /* here we have a segment miss */
 do_ste_alloc:
        bl      .ste_allocate           /* try to insert stab entry */
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c
new file mode 100644 (file)
index 0000000..f74b62c
--- /dev/null
@@ -0,0 +1,527 @@
+/*
+ * Performance counter callchain support - powerpc architecture code
+ *
+ * Copyright © 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_counter.h>
+#include <linux/percpu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <asm/ptrace.h>
+#include <asm/pgtable.h>
+#include <asm/sigcontext.h>
+#include <asm/ucontext.h>
+#include <asm/vdso.h>
+#ifdef CONFIG_PPC64
+#include "ppc32.h"
+#endif
+
+/*
+ * Store another value in a callchain_entry.
+ */
+static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
+{
+       unsigned int nr = entry->nr;
+
+       if (nr < PERF_MAX_STACK_DEPTH) {
+               entry->ip[nr] = ip;
+               entry->nr = nr + 1;
+       }
+}
+
+/*
+ * Is sp valid as the address of the next kernel stack frame after prev_sp?
+ * The next frame may be in a different stack area but should not go
+ * back down in the same stack area.
+ */
+static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
+{
+       if (sp & 0xf)
+               return 0;               /* must be 16-byte aligned */
+       if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
+               return 0;
+       if (sp >= prev_sp + STACK_FRAME_OVERHEAD)
+               return 1;
+       /*
+        * sp could decrease when we jump off an interrupt stack
+        * back to the regular process stack.
+        */
+       if ((sp & ~(THREAD_SIZE - 1)) != (prev_sp & ~(THREAD_SIZE - 1)))
+               return 1;
+       return 0;
+}
+
+static void perf_callchain_kernel(struct pt_regs *regs,
+                                 struct perf_callchain_entry *entry)
+{
+       unsigned long sp, next_sp;
+       unsigned long next_ip;
+       unsigned long lr;
+       long level = 0;
+       unsigned long *fp;
+
+       lr = regs->link;
+       sp = regs->gpr[1];
+       callchain_store(entry, PERF_CONTEXT_KERNEL);
+       callchain_store(entry, regs->nip);
+
+       if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
+               return;
+
+       for (;;) {
+               fp = (unsigned long *) sp;
+               next_sp = fp[0];
+
+               if (next_sp == sp + STACK_INT_FRAME_SIZE &&
+                   fp[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
+                       /*
+                        * This looks like an interrupt frame for an
+                        * interrupt that occurred in the kernel
+                        */
+                       regs = (struct pt_regs *)(sp + STACK_FRAME_OVERHEAD);
+                       next_ip = regs->nip;
+                       lr = regs->link;
+                       level = 0;
+                       callchain_store(entry, PERF_CONTEXT_KERNEL);
+
+               } else {
+                       if (level == 0)
+                               next_ip = lr;
+                       else
+                               next_ip = fp[STACK_FRAME_LR_SAVE];
+
+                       /*
+                        * We can't tell which of the first two addresses
+                        * we get are valid, but we can filter out the
+                        * obviously bogus ones here.  We replace them
+                        * with 0 rather than removing them entirely so
+                        * that userspace can tell which is which.
+                        */
+                       if ((level == 1 && next_ip == lr) ||
+                           (level <= 1 && !kernel_text_address(next_ip)))
+                               next_ip = 0;
+
+                       ++level;
+               }
+
+               callchain_store(entry, next_ip);
+               if (!valid_next_sp(next_sp, sp))
+                       return;
+               sp = next_sp;
+       }
+}
+
+#ifdef CONFIG_PPC64
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define is_huge_psize(pagesize)        (HPAGE_SHIFT && mmu_huge_psizes[pagesize])
+#else
+#define is_huge_psize(pagesize)        0
+#endif
+
+/*
+ * On 64-bit we don't want to invoke hash_page on user addresses from
+ * interrupt context, so if the access faults, we read the page tables
+ * to find which page (if any) is mapped and access it directly.
+ */
+static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
+{
+       pgd_t *pgdir;
+       pte_t *ptep, pte;
+       int pagesize;
+       unsigned long addr = (unsigned long) ptr;
+       unsigned long offset;
+       unsigned long pfn;
+       void *kaddr;
+
+       pgdir = current->mm->pgd;
+       if (!pgdir)
+               return -EFAULT;
+
+       pagesize = get_slice_psize(current->mm, addr);
+
+       /* align address to page boundary */
+       offset = addr & ((1ul << mmu_psize_defs[pagesize].shift) - 1);
+       addr -= offset;
+
+       if (is_huge_psize(pagesize))
+               ptep = huge_pte_offset(current->mm, addr);
+       else
+               ptep = find_linux_pte(pgdir, addr);
+
+       if (ptep == NULL)
+               return -EFAULT;
+       pte = *ptep;
+       if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
+               return -EFAULT;
+       pfn = pte_pfn(pte);
+       if (!page_is_ram(pfn))
+               return -EFAULT;
+
+       /* no highmem to worry about here */
+       kaddr = pfn_to_kaddr(pfn);
+       memcpy(ret, kaddr + offset, nb);
+       return 0;
+}
+
+static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
+{
+       if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) ||
+           ((unsigned long)ptr & 7))
+               return -EFAULT;
+
+       if (!__get_user_inatomic(*ret, ptr))
+               return 0;
+
+       return read_user_stack_slow(ptr, ret, 8);
+}
+
+static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
+{
+       if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
+           ((unsigned long)ptr & 3))
+               return -EFAULT;
+
+       if (!__get_user_inatomic(*ret, ptr))
+               return 0;
+
+       return read_user_stack_slow(ptr, ret, 4);
+}
+
+static inline int valid_user_sp(unsigned long sp, int is_64)
+{
+       if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32)
+               return 0;
+       return 1;
+}
+
+/*
+ * 64-bit user processes use the same stack frame for RT and non-RT signals.
+ */
+struct signal_frame_64 {
+       char            dummy[__SIGNAL_FRAMESIZE];
+       struct ucontext uc;
+       unsigned long   unused[2];
+       unsigned int    tramp[6];
+       struct siginfo  *pinfo;
+       void            *puc;
+       struct siginfo  info;
+       char            abigap[288];
+};
+
+static int is_sigreturn_64_address(unsigned long nip, unsigned long fp)
+{
+       if (nip == fp + offsetof(struct signal_frame_64, tramp))
+               return 1;
+       if (vdso64_rt_sigtramp && current->mm->context.vdso_base &&
+           nip == current->mm->context.vdso_base + vdso64_rt_sigtramp)
+               return 1;
+       return 0;
+}
+
+/*
+ * Do some sanity checking on the signal frame pointed to by sp.
+ * We check the pinfo and puc pointers in the frame.
+ */
+static int sane_signal_64_frame(unsigned long sp)
+{
+       struct signal_frame_64 __user *sf;
+       unsigned long pinfo, puc;
+
+       sf = (struct signal_frame_64 __user *) sp;
+       if (read_user_stack_64((unsigned long __user *) &sf->pinfo, &pinfo) ||
+           read_user_stack_64((unsigned long __user *) &sf->puc, &puc))
+               return 0;
+       return pinfo == (unsigned long) &sf->info &&
+               puc == (unsigned long) &sf->uc;
+}
+
+static void perf_callchain_user_64(struct pt_regs *regs,
+                                  struct perf_callchain_entry *entry)
+{
+       unsigned long sp, next_sp;
+       unsigned long next_ip;
+       unsigned long lr;
+       long level = 0;
+       struct signal_frame_64 __user *sigframe;
+       unsigned long __user *fp, *uregs;
+
+       next_ip = regs->nip;
+       lr = regs->link;
+       sp = regs->gpr[1];
+       callchain_store(entry, PERF_CONTEXT_USER);
+       callchain_store(entry, next_ip);
+
+       for (;;) {
+               fp = (unsigned long __user *) sp;
+               if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
+                       return;
+               if (level > 0 && read_user_stack_64(&fp[2], &next_ip))
+                       return;
+
+               /*
+                * Note: the next_sp - sp >= signal frame size check
+                * is true when next_sp < sp, which can happen when
+                * transitioning from an alternate signal stack to the
+                * normal stack.
+                */
+               if (next_sp - sp >= sizeof(struct signal_frame_64) &&
+                   (is_sigreturn_64_address(next_ip, sp) ||
+                    (level <= 1 && is_sigreturn_64_address(lr, sp))) &&
+                   sane_signal_64_frame(sp)) {
+                       /*
+                        * This looks like a signal frame
+                        */
+                       sigframe = (struct signal_frame_64 __user *) sp;
+                       uregs = sigframe->uc.uc_mcontext.gp_regs;
+                       if (read_user_stack_64(&uregs[PT_NIP], &next_ip) ||
+                           read_user_stack_64(&uregs[PT_LNK], &lr) ||
+                           read_user_stack_64(&uregs[PT_R1], &sp))
+                               return;
+                       level = 0;
+                       callchain_store(entry, PERF_CONTEXT_USER);
+                       callchain_store(entry, next_ip);
+                       continue;
+               }
+
+               if (level == 0)
+                       next_ip = lr;
+               callchain_store(entry, next_ip);
+               ++level;
+               sp = next_sp;
+       }
+}
+
+static inline int current_is_64bit(void)
+{
+       /*
+        * We can't use test_thread_flag() here because we may be on an
+        * interrupt stack, and the thread flags don't get copied over
+        * from the thread_info on the main stack to the interrupt stack.
+        */
+       return !test_ti_thread_flag(task_thread_info(current), TIF_32BIT);
+}
+
+#else  /* CONFIG_PPC64 */
+/*
+ * On 32-bit we just access the address and let hash_page create a
+ * HPTE if necessary, so there is no need to fall back to reading
+ * the page tables.  Since this is called at interrupt level,
+ * do_page_fault() won't treat a DSI as a page fault.
+ */
+static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
+{
+       if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
+           ((unsigned long)ptr & 3))
+               return -EFAULT;
+
+       return __get_user_inatomic(*ret, ptr);
+}
+
+static inline void perf_callchain_user_64(struct pt_regs *regs,
+                                         struct perf_callchain_entry *entry)
+{
+}
+
+static inline int current_is_64bit(void)
+{
+       return 0;
+}
+
+static inline int valid_user_sp(unsigned long sp, int is_64)
+{
+       if (!sp || (sp & 7) || sp > TASK_SIZE - 32)
+               return 0;
+       return 1;
+}
+
+#define __SIGNAL_FRAMESIZE32   __SIGNAL_FRAMESIZE
+#define sigcontext32           sigcontext
+#define mcontext32             mcontext
+#define ucontext32             ucontext
+#define compat_siginfo_t       struct siginfo
+
+#endif /* CONFIG_PPC64 */
+
+/*
+ * Layout for non-RT signal frames
+ */
+struct signal_frame_32 {
+       char                    dummy[__SIGNAL_FRAMESIZE32];
+       struct sigcontext32     sctx;
+       struct mcontext32       mctx;
+       int                     abigap[56];
+};
+
+/*
+ * Layout for RT signal frames
+ */
+struct rt_signal_frame_32 {
+       char                    dummy[__SIGNAL_FRAMESIZE32 + 16];
+       compat_siginfo_t        info;
+       struct ucontext32       uc;
+       int                     abigap[56];
+};
+
+static int is_sigreturn_32_address(unsigned int nip, unsigned int fp)
+{
+       if (nip == fp + offsetof(struct signal_frame_32, mctx.mc_pad))
+               return 1;
+       if (vdso32_sigtramp && current->mm->context.vdso_base &&
+           nip == current->mm->context.vdso_base + vdso32_sigtramp)
+               return 1;
+       return 0;
+}
+
+static int is_rt_sigreturn_32_address(unsigned int nip, unsigned int fp)
+{
+       if (nip == fp + offsetof(struct rt_signal_frame_32,
+                                uc.uc_mcontext.mc_pad))
+               return 1;
+       if (vdso32_rt_sigtramp && current->mm->context.vdso_base &&
+           nip == current->mm->context.vdso_base + vdso32_rt_sigtramp)
+               return 1;
+       return 0;
+}
+
+static int sane_signal_32_frame(unsigned int sp)
+{
+       struct signal_frame_32 __user *sf;
+       unsigned int regs;
+
+       sf = (struct signal_frame_32 __user *) (unsigned long) sp;
+       if (read_user_stack_32((unsigned int __user *) &sf->sctx.regs, &regs))
+               return 0;
+       return regs == (unsigned long) &sf->mctx;
+}
+
+static int sane_rt_signal_32_frame(unsigned int sp)
+{
+       struct rt_signal_frame_32 __user *sf;
+       unsigned int regs;
+
+       sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
+       if (read_user_stack_32((unsigned int __user *) &sf->uc.uc_regs, &regs))
+               return 0;
+       return regs == (unsigned long) &sf->uc.uc_mcontext;
+}
+
+static unsigned int __user *signal_frame_32_regs(unsigned int sp,
+                               unsigned int next_sp, unsigned int next_ip)
+{
+       struct mcontext32 __user *mctx = NULL;
+       struct signal_frame_32 __user *sf;
+       struct rt_signal_frame_32 __user *rt_sf;
+
+       /*
+        * Note: the next_sp - sp >= signal frame size check
+        * is true when next_sp < sp, for example, when
+        * transitioning from an alternate signal stack to the
+        * normal stack.
+        */
+       if (next_sp - sp >= sizeof(struct signal_frame_32) &&
+           is_sigreturn_32_address(next_ip, sp) &&
+           sane_signal_32_frame(sp)) {
+               sf = (struct signal_frame_32 __user *) (unsigned long) sp;
+               mctx = &sf->mctx;
+       }
+
+       if (!mctx && next_sp - sp >= sizeof(struct rt_signal_frame_32) &&
+           is_rt_sigreturn_32_address(next_ip, sp) &&
+           sane_rt_signal_32_frame(sp)) {
+               rt_sf = (struct rt_signal_frame_32 __user *) (unsigned long) sp;
+               mctx = &rt_sf->uc.uc_mcontext;
+       }
+
+       if (!mctx)
+               return NULL;
+       return mctx->mc_gregs;
+}
+
+static void perf_callchain_user_32(struct pt_regs *regs,
+                                  struct perf_callchain_entry *entry)
+{
+       unsigned int sp, next_sp;
+       unsigned int next_ip;
+       unsigned int lr;
+       long level = 0;
+       unsigned int __user *fp, *uregs;
+
+       next_ip = regs->nip;
+       lr = regs->link;
+       sp = regs->gpr[1];
+       callchain_store(entry, PERF_CONTEXT_USER);
+       callchain_store(entry, next_ip);
+
+       while (entry->nr < PERF_MAX_STACK_DEPTH) {
+               fp = (unsigned int __user *) (unsigned long) sp;
+               if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
+                       return;
+               if (level > 0 && read_user_stack_32(&fp[1], &next_ip))
+                       return;
+
+               uregs = signal_frame_32_regs(sp, next_sp, next_ip);
+               if (!uregs && level <= 1)
+                       uregs = signal_frame_32_regs(sp, next_sp, lr);
+               if (uregs) {
+                       /*
+                        * This looks like a signal frame, so restart
+                        * the stack trace with the values in it.
+                        */
+                       if (read_user_stack_32(&uregs[PT_NIP], &next_ip) ||
+                           read_user_stack_32(&uregs[PT_LNK], &lr) ||
+                           read_user_stack_32(&uregs[PT_R1], &sp))
+                               return;
+                       level = 0;
+                       callchain_store(entry, PERF_CONTEXT_USER);
+                       callchain_store(entry, next_ip);
+                       continue;
+               }
+
+               if (level == 0)
+                       next_ip = lr;
+               callchain_store(entry, next_ip);
+               ++level;
+               sp = next_sp;
+       }
+}
+
+/*
+ * Since we can't get PMU interrupts inside a PMU interrupt handler,
+ * we don't need separate irq and nmi entries here.
+ */
+static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+       struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
+
+       entry->nr = 0;
+
+       if (current->pid == 0)          /* idle task? */
+               return entry;
+
+       if (!user_mode(regs)) {
+               perf_callchain_kernel(regs, entry);
+               if (current->mm)
+                       regs = task_pt_regs(current);
+               else
+                       regs = NULL;
+       }
+
+       if (regs) {
+               if (current_is_64bit())
+                       perf_callchain_user_64(regs, entry);
+               else
+                       perf_callchain_user_32(regs, entry);
+       }
+
+       return entry;
+}
index 5b7038f..a685652 100644 (file)
@@ -92,15 +92,13 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
                     : "memory" );
 }
 
-void slb_flush_and_rebolt(void)
+static void __slb_flush_and_rebolt(void)
 {
        /* If you change this make sure you change SLB_NUM_BOLTED
         * appropriately too. */
        unsigned long linear_llp, vmalloc_llp, lflags, vflags;
        unsigned long ksp_esid_data, ksp_vsid_data;
 
-       WARN_ON(!irqs_disabled());
-
        linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
        vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
        lflags = SLB_VSID_KERNEL | linear_llp;
@@ -117,12 +115,6 @@ void slb_flush_and_rebolt(void)
                ksp_vsid_data = get_slb_shadow()->save_area[2].vsid;
        }
 
-       /*
-        * We can't take a PMU exception in the following code, so hard
-        * disable interrupts.
-        */
-       hard_irq_disable();
-
        /* We need to do this all in asm, so we're sure we don't touch
         * the stack between the slbia and rebolting it. */
        asm volatile("isync\n"
@@ -139,6 +131,21 @@ void slb_flush_and_rebolt(void)
                     : "memory");
 }
 
+void slb_flush_and_rebolt(void)
+{
+
+       WARN_ON(!irqs_disabled());
+
+       /*
+        * We can't take a PMU exception in the following code, so hard
+        * disable interrupts.
+        */
+       hard_irq_disable();
+
+       __slb_flush_and_rebolt();
+       get_paca()->slb_cache_ptr = 0;
+}
+
 void slb_vmalloc_update(void)
 {
        unsigned long vflags;
@@ -180,12 +187,20 @@ static inline int esids_match(unsigned long addr1, unsigned long addr2)
 /* Flush all user entries from the segment table of the current processor. */
 void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 {
-       unsigned long offset = get_paca()->slb_cache_ptr;
+       unsigned long offset;
        unsigned long slbie_data = 0;
        unsigned long pc = KSTK_EIP(tsk);
        unsigned long stack = KSTK_ESP(tsk);
        unsigned long unmapped_base;
 
+       /*
+        * We need interrupts hard-disabled here, not just soft-disabled,
+        * so that a PMU interrupt can't occur, which might try to access
+        * user memory (to get a stack trace) and possibly cause an SLB miss
+        * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+        */
+       hard_irq_disable();
+       offset = get_paca()->slb_cache_ptr;
        if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) &&
            offset <= SLB_CACHE_ENTRIES) {
                int i;
@@ -200,7 +215,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
                }
                asm volatile("isync" : : : "memory");
        } else {
-               slb_flush_and_rebolt();
+               __slb_flush_and_rebolt();
        }
 
        /* Workaround POWER5 < DD2.1 issue */
index 98cd1dc..ab5fb48 100644 (file)
@@ -164,7 +164,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
 {
        struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
        struct stab_entry *ste;
-       unsigned long offset = __get_cpu_var(stab_cache_ptr);
+       unsigned long offset;
        unsigned long pc = KSTK_EIP(tsk);
        unsigned long stack = KSTK_ESP(tsk);
        unsigned long unmapped_base;
@@ -172,6 +172,15 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm)
        /* Force previous translations to complete. DRENG */
        asm volatile("isync" : : : "memory");
 
+       /*
+        * We need interrupts hard-disabled here, not just soft-disabled,
+        * so that a PMU interrupt can't occur, which might try to access
+        * user memory (to get a stack trace) and possibly cause an STAB miss
+        * which would update the stab_cache/stab_cache_ptr per-cpu variables.
+        */
+       hard_irq_disable();
+
+       offset = __get_cpu_var(stab_cache_ptr);
        if (offset <= NR_STAB_CACHE_ENTRIES) {
                int i;
 
index e030e86..1c866ef 100644 (file)
@@ -84,7 +84,7 @@ config S390
        select HAVE_FUNCTION_TRACER
        select HAVE_FUNCTION_TRACE_MCOUNT_TEST
        select HAVE_FTRACE_MCOUNT_RECORD
-       select HAVE_FTRACE_SYSCALLS
+       select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_DYNAMIC_FTRACE
        select HAVE_FUNCTION_GRAPH_TRACER
        select HAVE_DEFAULT_NO_SPIN_MUTEXES
index fcba206..4e91a25 100644 (file)
@@ -900,7 +900,7 @@ CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
-CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
 CONFIG_TRACING_SUPPORT=y
 CONFIG_FTRACE=y
 # CONFIG_FUNCTION_TRACER is not set
index c9af0d1..41ce686 100644 (file)
@@ -191,4 +191,33 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
 #define _raw_read_relax(lock)  cpu_relax()
 #define _raw_write_relax(lock) cpu_relax()
 
+#define __always_inline__spin_lock
+#define __always_inline__read_lock
+#define __always_inline__write_lock
+#define __always_inline__spin_lock_bh
+#define __always_inline__read_lock_bh
+#define __always_inline__write_lock_bh
+#define __always_inline__spin_lock_irq
+#define __always_inline__read_lock_irq
+#define __always_inline__write_lock_irq
+#define __always_inline__spin_lock_irqsave
+#define __always_inline__read_lock_irqsave
+#define __always_inline__write_lock_irqsave
+#define __always_inline__spin_trylock
+#define __always_inline__read_trylock
+#define __always_inline__write_trylock
+#define __always_inline__spin_trylock_bh
+#define __always_inline__spin_unlock
+#define __always_inline__read_unlock
+#define __always_inline__write_unlock
+#define __always_inline__spin_unlock_bh
+#define __always_inline__read_unlock_bh
+#define __always_inline__write_unlock_bh
+#define __always_inline__spin_unlock_irq
+#define __always_inline__read_unlock_irq
+#define __always_inline__write_unlock_irq
+#define __always_inline__spin_unlock_irqrestore
+#define __always_inline__read_unlock_irqrestore
+#define __always_inline__write_unlock_irqrestore
+
 #endif /* __ASM_SPINLOCK_H */
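
These empty defines act as opt-in markers: an architecture that defines __always_inline__<op> tells the generic locking code that the corresponding operation is cheap enough to inline instead of calling out of line. A rough sketch of the consuming side, with the generic names treated as assumptions rather than quoted from this series:

	/* sketch only; the real check lives in the generic spinlock headers */
	#ifdef __always_inline__spin_lock
	#define _spin_lock(lock)	__spin_lock(lock)	/* inlined fast path */
	#else
	void __lockfunc _spin_lock(spinlock_t *lock);		/* out-of-line version */
	#endif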
index ba1cab9..07eb61b 100644 (file)
@@ -92,7 +92,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_SYSCALL_TRACE      8       /* syscall trace active */
 #define TIF_SYSCALL_AUDIT      9       /* syscall auditing active */
 #define TIF_SECCOMP            10      /* secure computing */
-#define TIF_SYSCALL_FTRACE     11      /* ftrace syscall instrumentation */
+#define TIF_SYSCALL_TRACEPOINT 11      /* syscall tracepoint instrumentation */
 #define TIF_USEDFPU            16      /* FPU was used by this task this quantum (SMP) */
 #define TIF_POLLING_NRFLAG     17      /* true if poll_idle() is polling 
                                           TIF_NEED_RESCHED */
@@ -111,7 +111,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT     (1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP           (1<<TIF_SECCOMP)
-#define _TIF_SYSCALL_FTRACE    (1<<TIF_SYSCALL_FTRACE)
+#define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
 #define _TIF_USEDFPU           (1<<TIF_USEDFPU)
 #define _TIF_POLLING_NRFLAG    (1<<TIF_POLLING_NRFLAG)
 #define _TIF_31BIT             (1<<TIF_31BIT)
index f78580a..f43d2ee 100644 (file)
@@ -54,7 +54,7 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
 _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
                 _TIF_MCCK_PENDING)
 _TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \
-               _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8)
+               _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8)
 
 STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER
 STACK_SIZE  = 1 << STACK_SHIFT
index 009ca61..a6f7b20 100644 (file)
@@ -57,7 +57,7 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
 _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
                 _TIF_MCCK_PENDING)
 _TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \
-               _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8)
+               _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8)
 
 #define BASED(name) name-system_call(%r13)
 
index 3e298e6..57bdcb1 100644 (file)
@@ -220,6 +220,29 @@ struct syscall_metadata *syscall_nr_to_meta(int nr)
        return syscalls_metadata[nr];
 }
 
+int syscall_name_to_nr(char *name)
+{
+       int i;
+
+       if (!syscalls_metadata)
+               return -1;
+       for (i = 0; i < NR_syscalls; i++)
+               if (syscalls_metadata[i])
+                       if (!strcmp(syscalls_metadata[i]->name, name))
+                               return i;
+       return -1;
+}
+
+void set_syscall_enter_id(int num, int id)
+{
+       syscalls_metadata[num]->enter_id = id;
+}
+
+void set_syscall_exit_id(int num, int id)
+{
+       syscalls_metadata[num]->exit_id = id;
+}
+
 static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
 {
        struct syscall_metadata *start;
@@ -237,24 +260,19 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
        return NULL;
 }
 
-void arch_init_ftrace_syscalls(void)
+static int __init arch_init_ftrace_syscalls(void)
 {
        struct syscall_metadata *meta;
        int i;
-       static atomic_t refs;
-
-       if (atomic_inc_return(&refs) != 1)
-               goto out;
        syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * NR_syscalls,
                                    GFP_KERNEL);
        if (!syscalls_metadata)
-               goto out;
+               return -ENOMEM;
        for (i = 0; i < NR_syscalls; i++) {
                meta = find_syscall_meta((unsigned long)sys_call_table[i]);
                syscalls_metadata[i] = meta;
        }
-       return;
-out:
-       atomic_dec(&refs);
+       return 0;
 }
+arch_initcall(arch_init_ftrace_syscalls);
 #endif
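
The three new helpers give the generic syscall tracing code an architecture-independent way to look up syscalls and record their trace event ids. A hedged usage sketch; the caller and the id variables are hypothetical:

	int nr = syscall_name_to_nr("sys_close");	/* returns -1 if unknown or metadata missing */

	if (nr >= 0) {
		set_syscall_enter_id(nr, enter_event_id);	/* enter_event_id: placeholder */
		set_syscall_exit_id(nr, exit_event_id);		/* exit_event_id: placeholder */
	}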
index 43acd73..f3ddd7a 100644 (file)
@@ -51,6 +51,9 @@
 #include "compat_ptrace.h"
 #endif
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
 enum s390_regset {
        REGSET_GENERAL,
        REGSET_FP,
@@ -661,8 +664,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
                ret = -1;
        }
 
-       if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
-               ftrace_syscall_enter(regs);
+       if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+               trace_sys_enter(regs, regs->gprs[2]);
 
        if (unlikely(current->audit_context))
                audit_syscall_entry(is_compat_task() ?
@@ -679,8 +682,8 @@ asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
                audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]),
                                   regs->gprs[2]);
 
-       if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE)))
-               ftrace_syscall_exit(regs);
+       if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+               trace_sys_exit(regs, regs->gprs[2]);
 
        if (test_thread_flag(TIF_SYSCALL_TRACE))
                tracehook_report_syscall_exit(regs, 0);
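
Defining CREATE_TRACE_POINTS before including <trace/events/syscalls.h> makes this file the single place where the sys_enter/sys_exit tracepoint bodies are emitted; other users include the header without the define and only get the trace_* stubs. A minimal sketch of such a user, with the syscall number variable as a placeholder:

	#include <trace/events/syscalls.h>	/* declarations only; CREATE_TRACE_POINTS not set */

	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
		trace_sys_enter(regs, syscall_nr);	/* syscall_nr: placeholder */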
index 3f8b6a9..233cff5 100644 (file)
@@ -25,6 +25,8 @@ config SPARC
        select ARCH_WANT_OPTIONAL_GPIOLIB
        select RTC_CLASS
        select RTC_DRV_M48T59
+       select HAVE_DMA_ATTRS
+       select HAVE_DMA_API_DEBUG
 
 config SPARC32
        def_bool !64BIT
index 204e4bf..5a8c308 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/scatterlist.h>
 #include <linux/mm.h>
+#include <linux/dma-debug.h>
 
 #define DMA_ERROR_CODE (~(dma_addr_t)0x0)
 
@@ -13,142 +14,40 @@ extern int dma_set_mask(struct device *dev, u64 dma_mask);
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 #define dma_is_consistent(d, h)        (1)
 
-struct dma_ops {
-       void *(*alloc_coherent)(struct device *dev, size_t size,
-                               dma_addr_t *dma_handle, gfp_t flag);
-       void (*free_coherent)(struct device *dev, size_t size,
-                             void *cpu_addr, dma_addr_t dma_handle);
-       dma_addr_t (*map_page)(struct device *dev, struct page *page,
-                              unsigned long offset, size_t size,
-                              enum dma_data_direction direction);
-       void (*unmap_page)(struct device *dev, dma_addr_t dma_addr,
-                          size_t size,
-                          enum dma_data_direction direction);
-       int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents,
-                     enum dma_data_direction direction);
-       void (*unmap_sg)(struct device *dev, struct scatterlist *sg,
-                        int nhwentries,
-                        enum dma_data_direction direction);
-       void (*sync_single_for_cpu)(struct device *dev,
-                                   dma_addr_t dma_handle, size_t size,
-                                   enum dma_data_direction direction);
-       void (*sync_single_for_device)(struct device *dev,
-                                      dma_addr_t dma_handle, size_t size,
-                                      enum dma_data_direction direction);
-       void (*sync_sg_for_cpu)(struct device *dev, struct scatterlist *sg,
-                               int nelems,
-                               enum dma_data_direction direction);
-       void (*sync_sg_for_device)(struct device *dev,
-                                  struct scatterlist *sg, int nents,
-                                  enum dma_data_direction dir);
-};
-extern const struct dma_ops *dma_ops;
+extern struct dma_map_ops *dma_ops, pci32_dma_ops;
+extern struct bus_type pci_bus_type;
 
-static inline void *dma_alloc_coherent(struct device *dev, size_t size,
-                                      dma_addr_t *dma_handle, gfp_t flag)
-{
-       return dma_ops->alloc_coherent(dev, size, dma_handle, flag);
-}
-
-static inline void dma_free_coherent(struct device *dev, size_t size,
-                                    void *cpu_addr, dma_addr_t dma_handle)
-{
-       dma_ops->free_coherent(dev, size, cpu_addr, dma_handle);
-}
-
-static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
-                                       size_t size,
-                                       enum dma_data_direction direction)
-{
-       return dma_ops->map_page(dev, virt_to_page(cpu_addr),
-                                (unsigned long)cpu_addr & ~PAGE_MASK, size,
-                                direction);
-}
-
-static inline void dma_unmap_single(struct device *dev, dma_addr_t dma_addr,
-                                   size_t size,
-                                   enum dma_data_direction direction)
-{
-       dma_ops->unmap_page(dev, dma_addr, size, direction);
-}
-
-static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
-                                     unsigned long offset, size_t size,
-                                     enum dma_data_direction direction)
-{
-       return dma_ops->map_page(dev, page, offset, size, direction);
-}
-
-static inline void dma_unmap_page(struct device *dev, dma_addr_t dma_address,
-                                 size_t size,
-                                 enum dma_data_direction direction)
-{
-       dma_ops->unmap_page(dev, dma_address, size, direction);
-}
-
-static inline int dma_map_sg(struct device *dev, struct scatterlist *sg,
-                            int nents, enum dma_data_direction direction)
-{
-       return dma_ops->map_sg(dev, sg, nents, direction);
-}
-
-static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-                               int nents, enum dma_data_direction direction)
+static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
-       dma_ops->unmap_sg(dev, sg, nents, direction);
-}
-
-static inline void dma_sync_single_for_cpu(struct device *dev,
-                                          dma_addr_t dma_handle, size_t size,
-                                          enum dma_data_direction direction)
-{
-       dma_ops->sync_single_for_cpu(dev, dma_handle, size, direction);
+#if defined(CONFIG_SPARC32) && defined(CONFIG_PCI)
+       if (dev->bus == &pci_bus_type)
+               return &pci32_dma_ops;
+#endif
+       return dma_ops;
 }
 
-static inline void dma_sync_single_for_device(struct device *dev,
-                                             dma_addr_t dma_handle,
-                                             size_t size,
-                                             enum dma_data_direction direction)
-{
-       if (dma_ops->sync_single_for_device)
-               dma_ops->sync_single_for_device(dev, dma_handle, size,
-                                               direction);
-}
+#include <asm-generic/dma-mapping-common.h>
 
-static inline void dma_sync_sg_for_cpu(struct device *dev,
-                                      struct scatterlist *sg, int nelems,
-                                      enum dma_data_direction direction)
+static inline void *dma_alloc_coherent(struct device *dev, size_t size,
+                                      dma_addr_t *dma_handle, gfp_t flag)
 {
-       dma_ops->sync_sg_for_cpu(dev, sg, nelems, direction);
-}
+       struct dma_map_ops *ops = get_dma_ops(dev);
+       void *cpu_addr;
 
-static inline void dma_sync_sg_for_device(struct device *dev,
-                                         struct scatterlist *sg, int nelems,
-                                         enum dma_data_direction direction)
-{
-       if (dma_ops->sync_sg_for_device)
-               dma_ops->sync_sg_for_device(dev, sg, nelems, direction);
+       cpu_addr = ops->alloc_coherent(dev, size, dma_handle, flag);
+       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+       return cpu_addr;
 }
 
-static inline void dma_sync_single_range_for_cpu(struct device *dev,
-                                                dma_addr_t dma_handle,
-                                                unsigned long offset,
-                                                size_t size,
-                                                enum dma_data_direction dir)
+static inline void dma_free_coherent(struct device *dev, size_t size,
+                                    void *cpu_addr, dma_addr_t dma_handle)
 {
-       dma_sync_single_for_cpu(dev, dma_handle+offset, size, dir);
-}
+       struct dma_map_ops *ops = get_dma_ops(dev);
 
-static inline void dma_sync_single_range_for_device(struct device *dev,
-                                                   dma_addr_t dma_handle,
-                                                   unsigned long offset,
-                                                   size_t size,
-                                                   enum dma_data_direction dir)
-{
-       dma_sync_single_for_device(dev, dma_handle+offset, size, dir);
+       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
+       ops->free_coherent(dev, size, cpu_addr, dma_handle);
 }
 
-
 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
        return (dma_addr == DMA_ERROR_CODE);
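
With get_dma_ops() in place, the streaming DMA API is provided by asm-generic/dma-mapping-common.h and only the coherent allocators stay architecture-specific. A hedged sketch of a driver-side call that now routes through the ops table and the dma-debug hooks; dev and size are placeholders:

	void *buf;
	dma_addr_t handle;

	buf = dma_alloc_coherent(dev, size, &handle, GFP_KERNEL);	/* ops->alloc_coherent + debug_dma_alloc_coherent */
	if (buf) {
		/* ... program the device with 'handle' ... */
		dma_free_coherent(dev, size, buf, handle);		/* debug_dma_free_coherent + ops->free_coherent */
	}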
index 1934f2c..a0b443c 100644 (file)
@@ -89,8 +89,8 @@ static inline unsigned long get_softint(void)
        return retval;
 }
 
-void __trigger_all_cpu_backtrace(void);
-#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+void arch_trigger_all_cpu_backtrace(void);
+#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
 
 extern void *hardirq_stack[NR_CPUS];
 extern void *softirq_stack[NR_CPUS];
index 6e14fd1..d9c031f 100644 (file)
@@ -5,4 +5,7 @@
 #else
 #include <asm/pci_32.h>
 #endif
+
+#include <asm-generic/pci-dma-compat.h>
+
 #endif
index b41c4c1..ac0e836 100644 (file)
@@ -31,42 +31,8 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
  */
 #define PCI_DMA_BUS_IS_PHYS    (0)
 
-#include <asm/scatterlist.h>
-
 struct pci_dev;
 
-/* Allocate and map kernel buffer using consistent mode DMA for a device.
- * hwdev should be valid struct pci_dev pointer for PCI devices.
- */
-extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, dma_addr_t *dma_handle);
-
-/* Free and unmap a consistent DMA buffer.
- * cpu_addr is what was returned from pci_alloc_consistent,
- * size must be the same as what as passed into pci_alloc_consistent,
- * and likewise dma_addr must be the same as what *dma_addrp was set to.
- *
- * References to the memory and mappings assosciated with cpu_addr/dma_addr
- * past this call are illegal.
- */
-extern void pci_free_consistent(struct pci_dev *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle);
-
-/* Map a single buffer of the indicated size for DMA in streaming mode.
- * The 32-bit bus address to use is returned.
- *
- * Once the device is given the dma address, the device owns this memory
- * until either pci_unmap_single or pci_dma_sync_single_for_cpu is performed.
- */
-extern dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size, int direction);
-
-/* Unmap a single streaming mode DMA translation.  The dma_addr and size
- * must match what was provided for in a previous pci_map_single call.  All
- * other usages are undefined.
- *
- * After this call, reads by the cpu to the buffer are guaranteed to see
- * whatever the device wrote there.
- */
-extern void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, size_t size, int direction);
-
 /* pci_unmap_{single,page} is not a nop, thus... */
 #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
        dma_addr_t ADDR_NAME;
@@ -81,69 +47,6 @@ extern void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, size_t
 #define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
        (((PTR)->LEN_NAME) = (VAL))
 
-/*
- * Same as above, only with pages instead of mapped addresses.
- */
-extern dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
-                       unsigned long offset, size_t size, int direction);
-extern void pci_unmap_page(struct pci_dev *hwdev,
-                       dma_addr_t dma_address, size_t size, int direction);
-
-/* Map a set of buffers described by scatterlist in streaming
- * mode for DMA.  This is the scather-gather version of the
- * above pci_map_single interface.  Here the scatter gather list
- * elements are each tagged with the appropriate dma address
- * and length.  They are obtained via sg_dma_{address,length}(SG).
- *
- * NOTE: An implementation may be able to use a smaller number of
- *       DMA address/length pairs than there are SG table elements.
- *       (for example via virtual mapping capabilities)
- *       The routine returns the number of addr/length pairs actually
- *       used, at most nents.
- *
- * Device ownership issues as mentioned above for pci_map_single are
- * the same here.
- */
-extern int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction);
-
-/* Unmap a set of streaming mode DMA translations.
- * Again, cpu read rules concerning calls here are the same as for
- * pci_unmap_single() above.
- */
-extern void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nhwents, int direction);
-
-/* Make physical memory consistent for a single
- * streaming mode DMA translation after a transfer.
- *
- * If you perform a pci_map_single() but wish to interrogate the
- * buffer using the cpu, yet do not wish to teardown the PCI dma
- * mapping, you must call this function before doing so.  At the
- * next point you give the PCI dma address back to the card, you
- * must first perform a pci_dma_sync_for_device, and then the device
- * again owns the buffer.
- */
-extern void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction);
-extern void pci_dma_sync_single_for_device(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction);
-
-/* Make physical memory consistent for a set of streaming
- * mode DMA translations after a transfer.
- *
- * The same as pci_dma_sync_single_* but for a scatter-gather list,
- * same rules and usage.
- */
-extern void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction);
-extern void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction);
-
-/* Return whether the given PCI device DMA address mask can
- * be supported properly.  For example, if your device can
- * only drive the low 24-bits during PCI bus mastering, then
- * you would pass 0x00ffffff as the mask to this function.
- */
-static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask)
-{
-       return 1;
-}
-
 #ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
                                        enum pci_dma_burst_strategy *strat,
@@ -154,14 +57,6 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 }
 #endif
 
-#define PCI_DMA_ERROR_CODE      (~(dma_addr_t)0x0)
-
-static inline int pci_dma_mapping_error(struct pci_dev *pdev,
-                                       dma_addr_t dma_addr)
-{
-        return (dma_addr == PCI_DMA_ERROR_CODE);
-}
-
 struct device_node;
 extern struct device_node *pci_device_to_OF_node(struct pci_dev *pdev);
 
index 7a1e356..5cc9f6a 100644 (file)
@@ -35,37 +35,6 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
  */
 #define PCI_DMA_BUS_IS_PHYS    (0)
 
-static inline void *pci_alloc_consistent(struct pci_dev *pdev, size_t size,
-                                        dma_addr_t *dma_handle)
-{
-       return dma_alloc_coherent(&pdev->dev, size, dma_handle, GFP_ATOMIC);
-}
-
-static inline void pci_free_consistent(struct pci_dev *pdev, size_t size,
-                                      void *vaddr, dma_addr_t dma_handle)
-{
-       return dma_free_coherent(&pdev->dev, size, vaddr, dma_handle);
-}
-
-static inline dma_addr_t pci_map_single(struct pci_dev *pdev, void *ptr,
-                                       size_t size, int direction)
-{
-       return dma_map_single(&pdev->dev, ptr, size,
-                             (enum dma_data_direction) direction);
-}
-
-static inline void pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr,
-                                   size_t size, int direction)
-{
-       dma_unmap_single(&pdev->dev, dma_addr, size,
-                        (enum dma_data_direction) direction);
-}
-
-#define pci_map_page(dev, page, off, size, dir) \
-       pci_map_single(dev, (page_address(page) + (off)), size, dir)
-#define pci_unmap_page(dev,addr,sz,dir) \
-       pci_unmap_single(dev,addr,sz,dir)
-
 /* pci_unmap_{single,page} is not a nop, thus... */
 #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)      \
        dma_addr_t ADDR_NAME;
@@ -80,57 +49,6 @@ static inline void pci_unmap_single(struct pci_dev *pdev, dma_addr_t dma_addr,
 #define pci_unmap_len_set(PTR, LEN_NAME, VAL)          \
        (((PTR)->LEN_NAME) = (VAL))
 
-static inline int pci_map_sg(struct pci_dev *pdev, struct scatterlist *sg,
-                            int nents, int direction)
-{
-       return dma_map_sg(&pdev->dev, sg, nents,
-                         (enum dma_data_direction) direction);
-}
-
-static inline void pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sg,
-                               int nents, int direction)
-{
-       dma_unmap_sg(&pdev->dev, sg, nents,
-                    (enum dma_data_direction) direction);
-}
-
-static inline void pci_dma_sync_single_for_cpu(struct pci_dev *pdev,
-                                              dma_addr_t dma_handle,
-                                              size_t size, int direction)
-{
-       dma_sync_single_for_cpu(&pdev->dev, dma_handle, size,
-                               (enum dma_data_direction) direction);
-}
-
-static inline void pci_dma_sync_single_for_device(struct pci_dev *pdev,
-                                                 dma_addr_t dma_handle,
-                                                 size_t size, int direction)
-{
-       /* No flushing needed to sync cpu writes to the device.  */
-}
-
-static inline void pci_dma_sync_sg_for_cpu(struct pci_dev *pdev,
-                                          struct scatterlist *sg,
-                                          int nents, int direction)
-{
-       dma_sync_sg_for_cpu(&pdev->dev, sg, nents,
-                           (enum dma_data_direction) direction);
-}
-
-static inline void pci_dma_sync_sg_for_device(struct pci_dev *pdev,
-                                             struct scatterlist *sg,
-                                             int nelems, int direction)
-{
-       /* No flushing needed to sync cpu writes to the device.  */
-}
-
-/* Return whether the given PCI device DMA address mask can
- * be supported properly.  For example, if your device can
- * only drive the low 24-bits during PCI bus mastering, then
- * you would pass 0x00ffffff as the mask to this function.
- */
-extern int pci_dma_supported(struct pci_dev *hwdev, u64 mask);
-
 /* PCI IOMMU mapping bypass support. */
 
 /* PCI 64-bit addressing works for all slots on all controller
@@ -140,12 +58,6 @@ extern int pci_dma_supported(struct pci_dev *hwdev, u64 mask);
 #define PCI64_REQUIRED_MASK    (~(dma64_addr_t)0)
 #define PCI64_ADDR_BASE                0xfffc000000000000UL
 
-static inline int pci_dma_mapping_error(struct pci_dev *pdev,
-                                       dma_addr_t dma_addr)
-{
-       return dma_mapping_error(&pdev->dev, dma_addr);
-}
-
 #ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
                                        enum pci_dma_burst_strategy *strat,
index 46f91ab..857630c 100644 (file)
@@ -76,7 +76,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
  *
  * Unfortunately this scheme limits us to ~16,000,000 cpus.
  */
-static inline void __read_lock(raw_rwlock_t *rw)
+static inline void arch_read_lock(raw_rwlock_t *rw)
 {
        register raw_rwlock_t *lp asm("g1");
        lp = rw;
@@ -92,11 +92,11 @@ static inline void __read_lock(raw_rwlock_t *rw)
 #define __raw_read_lock(lock) \
 do {   unsigned long flags; \
        local_irq_save(flags); \
-       __read_lock(lock); \
+       arch_read_lock(lock); \
        local_irq_restore(flags); \
 } while(0)
 
-static inline void __read_unlock(raw_rwlock_t *rw)
+static inline void arch_read_unlock(raw_rwlock_t *rw)
 {
        register raw_rwlock_t *lp asm("g1");
        lp = rw;
@@ -112,7 +112,7 @@ static inline void __read_unlock(raw_rwlock_t *rw)
 #define __raw_read_unlock(lock) \
 do {   unsigned long flags; \
        local_irq_save(flags); \
-       __read_unlock(lock); \
+       arch_read_unlock(lock); \
        local_irq_restore(flags); \
 } while(0)
 
@@ -150,7 +150,7 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
        return (val == 0);
 }
 
-static inline int __read_trylock(raw_rwlock_t *rw)
+static inline int arch_read_trylock(raw_rwlock_t *rw)
 {
        register raw_rwlock_t *lp asm("g1");
        register int res asm("o0");
@@ -169,7 +169,7 @@ static inline int __read_trylock(raw_rwlock_t *rw)
 ({     unsigned long flags; \
        int res; \
        local_irq_save(flags); \
-       res = __read_trylock(lock); \
+       res = arch_read_trylock(lock); \
        local_irq_restore(flags); \
        res; \
 })
index f6b2b92..43e5147 100644 (file)
@@ -92,7 +92,7 @@ static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long fla
 
 /* Multi-reader locks, these are much saner than the 32-bit Sparc ones... */
 
-static void inline __read_lock(raw_rwlock_t *lock)
+static void inline arch_read_lock(raw_rwlock_t *lock)
 {
        unsigned long tmp1, tmp2;
 
@@ -115,7 +115,7 @@ static void inline __read_lock(raw_rwlock_t *lock)
        : "memory");
 }
 
-static int inline __read_trylock(raw_rwlock_t *lock)
+static int inline arch_read_trylock(raw_rwlock_t *lock)
 {
        int tmp1, tmp2;
 
@@ -136,7 +136,7 @@ static int inline __read_trylock(raw_rwlock_t *lock)
        return tmp1;
 }
 
-static void inline __read_unlock(raw_rwlock_t *lock)
+static void inline arch_read_unlock(raw_rwlock_t *lock)
 {
        unsigned long tmp1, tmp2;
 
@@ -152,7 +152,7 @@ static void inline __read_unlock(raw_rwlock_t *lock)
        : "memory");
 }
 
-static void inline __write_lock(raw_rwlock_t *lock)
+static void inline arch_write_lock(raw_rwlock_t *lock)
 {
        unsigned long mask, tmp1, tmp2;
 
@@ -177,7 +177,7 @@ static void inline __write_lock(raw_rwlock_t *lock)
        : "memory");
 }
 
-static void inline __write_unlock(raw_rwlock_t *lock)
+static void inline arch_write_unlock(raw_rwlock_t *lock)
 {
        __asm__ __volatile__(
 "      stw             %%g0, [%0]"
@@ -186,7 +186,7 @@ static void inline __write_unlock(raw_rwlock_t *lock)
        : "memory");
 }
 
-static int inline __write_trylock(raw_rwlock_t *lock)
+static int inline arch_write_trylock(raw_rwlock_t *lock)
 {
        unsigned long mask, tmp1, tmp2, result;
 
@@ -210,14 +210,14 @@ static int inline __write_trylock(raw_rwlock_t *lock)
        return result;
 }
 
-#define __raw_read_lock(p)     __read_lock(p)
-#define __raw_read_lock_flags(p, f) __read_lock(p)
-#define __raw_read_trylock(p)  __read_trylock(p)
-#define __raw_read_unlock(p)   __read_unlock(p)
-#define __raw_write_lock(p)    __write_lock(p)
-#define __raw_write_lock_flags(p, f) __write_lock(p)
-#define __raw_write_unlock(p)  __write_unlock(p)
-#define __raw_write_trylock(p) __write_trylock(p)
+#define __raw_read_lock(p)     arch_read_lock(p)
+#define __raw_read_lock_flags(p, f) arch_read_lock(p)
+#define __raw_read_trylock(p)  arch_read_trylock(p)
+#define __raw_read_unlock(p)   arch_read_unlock(p)
+#define __raw_write_lock(p)    arch_write_lock(p)
+#define __raw_write_lock_flags(p, f) arch_write_lock(p)
+#define __raw_write_unlock(p)  arch_write_unlock(p)
+#define __raw_write_trylock(p) arch_write_trylock(p)
 
 #define __raw_read_can_lock(rw)                (!((rw)->lock & 0x80000000UL))
 #define __raw_write_can_lock(rw)       (!(rw)->lock)
index 475ce46..29b88a5 100644 (file)
@@ -61,7 +61,7 @@ obj-$(CONFIG_SPARC64_SMP) += cpumap.o
 obj-$(CONFIG_SPARC32)     += devres.o
 devres-y                  := ../../../kernel/irq/devres.o
 
-obj-$(CONFIG_SPARC32)     += dma.o
+obj-y                     += dma.o
 
 obj-$(CONFIG_SPARC32_PCI) += pcic.o
 
index 524c32f..e1ba8ee 100644 (file)
-/* dma.c: PCI and SBUS DMA accessors for 32-bit sparc.
- *
- * Copyright (C) 2008 David S. Miller <davem@davemloft.net>
- */
-
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/dma-mapping.h>
-#include <linux/scatterlist.h>
-#include <linux/mm.h>
-
-#ifdef CONFIG_PCI
-#include <linux/pci.h>
-#endif
+#include <linux/dma-debug.h>
 
-#include "dma.h"
+#define PREALLOC_DMA_DEBUG_ENTRIES       (1 << 15)
 
-int dma_supported(struct device *dev, u64 mask)
+static int __init dma_init(void)
 {
-#ifdef CONFIG_PCI
-       if (dev->bus == &pci_bus_type)
-               return pci_dma_supported(to_pci_dev(dev), mask);
-#endif
+       dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
        return 0;
 }
-EXPORT_SYMBOL(dma_supported);
-
-int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-#ifdef CONFIG_PCI
-       if (dev->bus == &pci_bus_type)
-               return pci_set_dma_mask(to_pci_dev(dev), dma_mask);
-#endif
-       return -EOPNOTSUPP;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
-static void *dma32_alloc_coherent(struct device *dev, size_t size,
-                                 dma_addr_t *dma_handle, gfp_t flag)
-{
-#ifdef CONFIG_PCI
-       if (dev->bus == &pci_bus_type)
-               return pci_alloc_consistent(to_pci_dev(dev), size, dma_handle);
-#endif
-       return sbus_alloc_consistent(dev, size, dma_handle);
-}
-
-static void dma32_free_coherent(struct device *dev, size_t size,
-                               void *cpu_addr, dma_addr_t dma_handle)
-{
-#ifdef CONFIG_PCI
-       if (dev->bus == &pci_bus_type) {
-               pci_free_consistent(to_pci_dev(dev), size,
-                                   cpu_addr, dma_handle);
-               return;
-       }
-#endif
-       sbus_free_consistent(dev, size, cpu_addr, dma_handle);
-}
-
-static dma_addr_t dma32_map_page(struct device *dev, struct page *page,
-                                unsigned long offset, size_t size,
-                                enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
-       if (dev->bus == &pci_bus_type)
-               return pci_map_page(to_pci_dev(dev), page, offset,
-                                   size, (int)direction);
-#endif
-       return sbus_map_single(dev, page_address(page) + offset,
-                              size, (int)direction);
-}
-
-static void dma32_unmap_page(struct device *dev, dma_addr_t dma_address,
-                            size_t size, enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
-       if (dev->bus == &pci_bus_type) {
-               pci_unmap_page(to_pci_dev(dev), dma_address,
-                              size, (int)direction);
-               return;
-       }
-#endif
-       sbus_unmap_single(dev, dma_address, size, (int)direction);
-}
-
-static int dma32_map_sg(struct device *dev, struct scatterlist *sg,
-                       int nents, enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
-       if (dev->bus == &pci_bus_type)
-               return pci_map_sg(to_pci_dev(dev), sg, nents, (int)direction);
-#endif
-       return sbus_map_sg(dev, sg, nents, direction);
-}
-
-void dma32_unmap_sg(struct device *dev, struct scatterlist *sg,
-                   int nents, enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
-       if (dev->bus == &pci_bus_type) {
-               pci_unmap_sg(to_pci_dev(dev), sg, nents, (int)direction);
-               return;
-       }
-#endif
-       sbus_unmap_sg(dev, sg, nents, (int)direction);
-}
-
-static void dma32_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
-                                     size_t size,
-                                     enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
-       if (dev->bus == &pci_bus_type) {
-               pci_dma_sync_single_for_cpu(to_pci_dev(dev), dma_handle,
-                                           size, (int)direction);
-               return;
-       }
-#endif
-       sbus_dma_sync_single_for_cpu(dev, dma_handle, size, (int) direction);
-}
-
-static void dma32_sync_single_for_device(struct device *dev,
-                                        dma_addr_t dma_handle, size_t size,
-                                        enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
-       if (dev->bus == &pci_bus_type) {
-               pci_dma_sync_single_for_device(to_pci_dev(dev), dma_handle,
-                                              size, (int)direction);
-               return;
-       }
-#endif
-       sbus_dma_sync_single_for_device(dev, dma_handle, size, (int) direction);
-}
-
-static void dma32_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
-                                 int nelems, enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
-       if (dev->bus == &pci_bus_type) {
-               pci_dma_sync_sg_for_cpu(to_pci_dev(dev), sg,
-                                       nelems, (int)direction);
-               return;
-       }
-#endif
-       BUG();
-}
-
-static void dma32_sync_sg_for_device(struct device *dev,
-                                    struct scatterlist *sg, int nelems,
-                                    enum dma_data_direction direction)
-{
-#ifdef CONFIG_PCI
-       if (dev->bus == &pci_bus_type) {
-               pci_dma_sync_sg_for_device(to_pci_dev(dev), sg,
-                                          nelems, (int)direction);
-               return;
-       }
-#endif
-       BUG();
-}
-
-static const struct dma_ops dma32_dma_ops = {
-       .alloc_coherent         = dma32_alloc_coherent,
-       .free_coherent          = dma32_free_coherent,
-       .map_page               = dma32_map_page,
-       .unmap_page             = dma32_unmap_page,
-       .map_sg                 = dma32_map_sg,
-       .unmap_sg               = dma32_unmap_sg,
-       .sync_single_for_cpu    = dma32_sync_single_for_cpu,
-       .sync_single_for_device = dma32_sync_single_for_device,
-       .sync_sg_for_cpu        = dma32_sync_sg_for_cpu,
-       .sync_sg_for_device     = dma32_sync_sg_for_device,
-};
-
-const struct dma_ops *dma_ops = &dma32_dma_ops;
-EXPORT_SYMBOL(dma_ops);
+fs_initcall(dma_init);
diff --git a/arch/sparc/kernel/dma.h b/arch/sparc/kernel/dma.h
deleted file mode 100644 (file)
index f8d8951..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-void *sbus_alloc_consistent(struct device *dev, long len, u32 *dma_addrp);
-void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba);
-dma_addr_t sbus_map_single(struct device *dev, void *va,
-                          size_t len, int direction);
-void sbus_unmap_single(struct device *dev, dma_addr_t ba,
-                      size_t n, int direction);
-int sbus_map_sg(struct device *dev, struct scatterlist *sg,
-               int n, int direction);
-void sbus_unmap_sg(struct device *dev, struct scatterlist *sg,
-                  int n, int direction);
-void sbus_dma_sync_single_for_cpu(struct device *dev, dma_addr_t ba,
-                                 size_t size, int direction);
-void sbus_dma_sync_single_for_device(struct device *dev, dma_addr_t ba,
-                                    size_t size, int direction);
index 0aeaefe..7690cc2 100644 (file)
@@ -353,7 +353,8 @@ static void dma_4u_free_coherent(struct device *dev, size_t size,
 
 static dma_addr_t dma_4u_map_page(struct device *dev, struct page *page,
                                  unsigned long offset, size_t sz,
-                                 enum dma_data_direction direction)
+                                 enum dma_data_direction direction,
+                                 struct dma_attrs *attrs)
 {
        struct iommu *iommu;
        struct strbuf *strbuf;
@@ -474,7 +475,8 @@ do_flush_sync:
 }
 
 static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr,
-                             size_t sz, enum dma_data_direction direction)
+                             size_t sz, enum dma_data_direction direction,
+                             struct dma_attrs *attrs)
 {
        struct iommu *iommu;
        struct strbuf *strbuf;
@@ -520,7 +522,8 @@ static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr,
 }
 
 static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist,
-                        int nelems, enum dma_data_direction direction)
+                        int nelems, enum dma_data_direction direction,
+                        struct dma_attrs *attrs)
 {
        struct scatterlist *s, *outs, *segstart;
        unsigned long flags, handle, prot, ctx;
@@ -691,7 +694,8 @@ static unsigned long fetch_sg_ctx(struct iommu *iommu, struct scatterlist *sg)
 }
 
 static void dma_4u_unmap_sg(struct device *dev, struct scatterlist *sglist,
-                           int nelems, enum dma_data_direction direction)
+                           int nelems, enum dma_data_direction direction,
+                           struct dma_attrs *attrs)
 {
        unsigned long flags, ctx;
        struct scatterlist *sg;
@@ -822,7 +826,7 @@ static void dma_4u_sync_sg_for_cpu(struct device *dev,
        spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
-static const struct dma_ops sun4u_dma_ops = {
+static struct dma_map_ops sun4u_dma_ops = {
        .alloc_coherent         = dma_4u_alloc_coherent,
        .free_coherent          = dma_4u_free_coherent,
        .map_page               = dma_4u_map_page,
@@ -833,9 +837,11 @@ static const struct dma_ops sun4u_dma_ops = {
        .sync_sg_for_cpu        = dma_4u_sync_sg_for_cpu,
 };
 
-const struct dma_ops *dma_ops = &sun4u_dma_ops;
+struct dma_map_ops *dma_ops = &sun4u_dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
+extern int pci64_dma_supported(struct pci_dev *pdev, u64 device_mask);
+
 int dma_supported(struct device *dev, u64 device_mask)
 {
        struct iommu *iommu = dev->archdata.iommu;
@@ -849,7 +855,7 @@ int dma_supported(struct device *dev, u64 device_mask)
 
 #ifdef CONFIG_PCI
        if (dev->bus == &pci_bus_type)
-               return pci_dma_supported(to_pci_dev(dev), device_mask);
+               return pci64_dma_supported(to_pci_dev(dev), device_mask);
 #endif
 
        return 0;
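
The extra struct dma_attrs * argument on the map/unmap hooks is what lets the generic *_attrs helpers pass DMA attributes straight through to the IOMMU code; the plain dma_map_page()/dma_map_sg() wrappers simply pass NULL. A rough sketch of an attrs-aware caller, with the particular attribute treated as an assumption:

	DEFINE_DMA_ATTRS(attrs);
	dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs);	/* attribute choice: assumption */
	mapping = dma_map_single_attrs(dev, cpu_addr, size, DMA_TO_DEVICE, &attrs);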
index 87ea0d0..edbea23 100644 (file)
@@ -48,8 +48,6 @@
 #include <asm/iommu.h>
 #include <asm/io-unit.h>
 
-#include "dma.h"
-
 #define mmu_inval_dma_area(p, l)       /* Anton pulled it out for 2.4.0-xx */
 
 static struct resource *_sparc_find_resource(struct resource *r,
@@ -246,7 +244,8 @@ EXPORT_SYMBOL(sbus_set_sbus64);
  * Typically devices use them for control blocks.
  * CPU may access them without any explicit flushing.
  */
-void *sbus_alloc_consistent(struct device *dev, long len, u32 *dma_addrp)
+static void *sbus_alloc_coherent(struct device *dev, size_t len,
+                                dma_addr_t *dma_addrp, gfp_t gfp)
 {
        struct of_device *op = to_of_device(dev);
        unsigned long len_total = (len + PAGE_SIZE-1) & PAGE_MASK;
@@ -299,7 +298,8 @@ err_nopages:
        return NULL;
 }
 
-void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba)
+static void sbus_free_coherent(struct device *dev, size_t n, void *p,
+                              dma_addr_t ba)
 {
        struct resource *res;
        struct page *pgv;
@@ -317,7 +317,7 @@ void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba)
 
        n = (n + PAGE_SIZE-1) & PAGE_MASK;
        if ((res->end-res->start)+1 != n) {
-               printk("sbus_free_consistent: region 0x%lx asked 0x%lx\n",
+               printk("sbus_free_consistent: region 0x%lx asked 0x%zx\n",
                    (long)((res->end-res->start)+1), n);
                return;
        }
@@ -337,8 +337,13 @@ void sbus_free_consistent(struct device *dev, long n, void *p, u32 ba)
  * CPU view of this memory may be inconsistent with
  * a device view and explicit flushing is necessary.
  */
-dma_addr_t sbus_map_single(struct device *dev, void *va, size_t len, int direction)
+static dma_addr_t sbus_map_page(struct device *dev, struct page *page,
+                               unsigned long offset, size_t len,
+                               enum dma_data_direction dir,
+                               struct dma_attrs *attrs)
 {
+       void *va = page_address(page) + offset;
+
        /* XXX why are some lengths signed, others unsigned? */
        if (len <= 0) {
                return 0;
@@ -350,12 +355,14 @@ dma_addr_t sbus_map_single(struct device *dev, void *va, size_t len, int directi
        return mmu_get_scsi_one(dev, va, len);
 }
 
-void sbus_unmap_single(struct device *dev, dma_addr_t ba, size_t n, int direction)
+static void sbus_unmap_page(struct device *dev, dma_addr_t ba, size_t n,
+                           enum dma_data_direction dir, struct dma_attrs *attrs)
 {
        mmu_release_scsi_one(dev, ba, n);
 }
 
-int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n, int direction)
+static int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n,
+                      enum dma_data_direction dir, struct dma_attrs *attrs)
 {
        mmu_get_scsi_sgl(dev, sg, n);
 
@@ -366,19 +373,38 @@ int sbus_map_sg(struct device *dev, struct scatterlist *sg, int n, int direction
        return n;
 }
 
-void sbus_unmap_sg(struct device *dev, struct scatterlist *sg, int n, int direction)
+static void sbus_unmap_sg(struct device *dev, struct scatterlist *sg, int n,
+                         enum dma_data_direction dir, struct dma_attrs *attrs)
 {
        mmu_release_scsi_sgl(dev, sg, n);
 }
 
-void sbus_dma_sync_single_for_cpu(struct device *dev, dma_addr_t ba, size_t size, int direction)
+static void sbus_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+                                int n, enum dma_data_direction dir)
 {
+       BUG();
 }
 
-void sbus_dma_sync_single_for_device(struct device *dev, dma_addr_t ba, size_t size, int direction)
+static void sbus_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+                                   int n, enum dma_data_direction dir)
 {
+       BUG();
 }
 
+struct dma_map_ops sbus_dma_ops = {
+       .alloc_coherent         = sbus_alloc_coherent,
+       .free_coherent          = sbus_free_coherent,
+       .map_page               = sbus_map_page,
+       .unmap_page             = sbus_unmap_page,
+       .map_sg                 = sbus_map_sg,
+       .unmap_sg               = sbus_unmap_sg,
+       .sync_sg_for_cpu        = sbus_sync_sg_for_cpu,
+       .sync_sg_for_device     = sbus_sync_sg_for_device,
+};
+
+struct dma_map_ops *dma_ops = &sbus_dma_ops;
+EXPORT_SYMBOL(dma_ops);
+
 static int __init sparc_register_ioport(void)
 {
        register_proc_sparc_ioport();
@@ -395,7 +421,8 @@ arch_initcall(sparc_register_ioport);
 /* Allocate and map kernel buffer using consistent mode DMA for a device.
  * hwdev should be valid struct pci_dev pointer for PCI devices.
  */
-void *pci_alloc_consistent(struct pci_dev *pdev, size_t len, dma_addr_t *pba)
+static void *pci32_alloc_coherent(struct device *dev, size_t len,
+                                 dma_addr_t *pba, gfp_t gfp)
 {
        unsigned long len_total = (len + PAGE_SIZE-1) & PAGE_MASK;
        unsigned long va;
@@ -439,7 +466,6 @@ void *pci_alloc_consistent(struct pci_dev *pdev, size_t len, dma_addr_t *pba)
        *pba = virt_to_phys(va); /* equals virt_to_bus (R.I.P.) for us. */
        return (void *) res->start;
 }
-EXPORT_SYMBOL(pci_alloc_consistent);
 
 /* Free and unmap a consistent DMA buffer.
  * cpu_addr is what was returned from pci_alloc_consistent,
@@ -449,7 +475,8 @@ EXPORT_SYMBOL(pci_alloc_consistent);
  * References to the memory and mappings associated with cpu_addr/dma_addr
  * past this call are illegal.
  */
-void pci_free_consistent(struct pci_dev *pdev, size_t n, void *p, dma_addr_t ba)
+static void pci32_free_coherent(struct device *dev, size_t n, void *p,
+                               dma_addr_t ba)
 {
        struct resource *res;
        unsigned long pgp;
@@ -481,60 +508,18 @@ void pci_free_consistent(struct pci_dev *pdev, size_t n, void *p, dma_addr_t ba)
 
        free_pages(pgp, get_order(n));
 }
-EXPORT_SYMBOL(pci_free_consistent);
-
-/* Map a single buffer of the indicated size for DMA in streaming mode.
- * The 32-bit bus address to use is returned.
- *
- * Once the device is given the dma address, the device owns this memory
- * until either pci_unmap_single or pci_dma_sync_single_* is performed.
- */
-dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
-    int direction)
-{
-       BUG_ON(direction == PCI_DMA_NONE);
-       /* IIep is write-through, not flushing. */
-       return virt_to_phys(ptr);
-}
-EXPORT_SYMBOL(pci_map_single);
-
-/* Unmap a single streaming mode DMA translation.  The dma_addr and size
- * must match what was provided for in a previous pci_map_single call.  All
- * other usages are undefined.
- *
- * After this call, reads by the cpu to the buffer are guaranteed to see
- * whatever the device wrote there.
- */
-void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t ba, size_t size,
-    int direction)
-{
-       BUG_ON(direction == PCI_DMA_NONE);
-       if (direction != PCI_DMA_TODEVICE) {
-               mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
-                   (size + PAGE_SIZE-1) & PAGE_MASK);
-       }
-}
-EXPORT_SYMBOL(pci_unmap_single);
 
 /*
  * Same as pci_map_single, but with pages.
  */
-dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
-                       unsigned long offset, size_t size, int direction)
+static dma_addr_t pci32_map_page(struct device *dev, struct page *page,
+                                unsigned long offset, size_t size,
+                                enum dma_data_direction dir,
+                                struct dma_attrs *attrs)
 {
-       BUG_ON(direction == PCI_DMA_NONE);
        /* IIep is write-through, not flushing. */
        return page_to_phys(page) + offset;
 }
-EXPORT_SYMBOL(pci_map_page);
-
-void pci_unmap_page(struct pci_dev *hwdev,
-                       dma_addr_t dma_address, size_t size, int direction)
-{
-       BUG_ON(direction == PCI_DMA_NONE);
-       /* mmu_inval_dma_area XXX */
-}
-EXPORT_SYMBOL(pci_unmap_page);
 
 /* Map a set of buffers described by scatterlist in streaming
  * mode for DMA.  This is the scather-gather version of the
@@ -551,13 +536,13 @@ EXPORT_SYMBOL(pci_unmap_page);
  * Device ownership issues as mentioned above for pci_map_single are
  * the same here.
  */
-int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
-    int direction)
+static int pci32_map_sg(struct device *device, struct scatterlist *sgl,
+                       int nents, enum dma_data_direction dir,
+                       struct dma_attrs *attrs)
 {
        struct scatterlist *sg;
        int n;
 
-       BUG_ON(direction == PCI_DMA_NONE);
        /* IIep is write-through, not flushing. */
        for_each_sg(sgl, sg, nents, n) {
                BUG_ON(page_address(sg_page(sg)) == NULL);
@@ -566,20 +551,19 @@ int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
        }
        return nents;
 }
-EXPORT_SYMBOL(pci_map_sg);
 
 /* Unmap a set of streaming mode DMA translations.
  * Again, cpu read rules concerning calls here are the same as for
  * pci_unmap_single() above.
  */
-void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
-    int direction)
+static void pci32_unmap_sg(struct device *dev, struct scatterlist *sgl,
+                          int nents, enum dma_data_direction dir,
+                          struct dma_attrs *attrs)
 {
        struct scatterlist *sg;
        int n;
 
-       BUG_ON(direction == PCI_DMA_NONE);
-       if (direction != PCI_DMA_TODEVICE) {
+       if (dir != PCI_DMA_TODEVICE) {
                for_each_sg(sgl, sg, nents, n) {
                        BUG_ON(page_address(sg_page(sg)) == NULL);
                        mmu_inval_dma_area(
@@ -588,7 +572,6 @@ void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sgl, int nents,
                }
        }
 }
-EXPORT_SYMBOL(pci_unmap_sg);
 
 /* Make physical memory consistent for a single
  * streaming mode DMA translation before or after a transfer.
@@ -600,25 +583,23 @@ EXPORT_SYMBOL(pci_unmap_sg);
  * must first perform a pci_dma_sync_for_device, and then the
  * device again owns the buffer.
  */
-void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction)
+static void pci32_sync_single_for_cpu(struct device *dev, dma_addr_t ba,
+                                     size_t size, enum dma_data_direction dir)
 {
-       BUG_ON(direction == PCI_DMA_NONE);
-       if (direction != PCI_DMA_TODEVICE) {
+       if (dir != PCI_DMA_TODEVICE) {
                mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
                    (size + PAGE_SIZE-1) & PAGE_MASK);
        }
 }
-EXPORT_SYMBOL(pci_dma_sync_single_for_cpu);
 
-void pci_dma_sync_single_for_device(struct pci_dev *hwdev, dma_addr_t ba, size_t size, int direction)
+static void pci32_sync_single_for_device(struct device *dev, dma_addr_t ba,
+                                        size_t size, enum dma_data_direction dir)
 {
-       BUG_ON(direction == PCI_DMA_NONE);
-       if (direction != PCI_DMA_TODEVICE) {
+       if (dir != PCI_DMA_TODEVICE) {
                mmu_inval_dma_area((unsigned long)phys_to_virt(ba),
                    (size + PAGE_SIZE-1) & PAGE_MASK);
        }
 }
-EXPORT_SYMBOL(pci_dma_sync_single_for_device);
 
 /* Make physical memory consistent for a set of streaming
  * mode DMA translations after a transfer.
@@ -626,13 +607,13 @@ EXPORT_SYMBOL(pci_dma_sync_single_for_device);
  * The same as pci_dma_sync_single_* but for a scatter-gather list,
  * same rules and usage.
  */
-void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction)
+static void pci32_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
+                                 int nents, enum dma_data_direction dir)
 {
        struct scatterlist *sg;
        int n;
 
-       BUG_ON(direction == PCI_DMA_NONE);
-       if (direction != PCI_DMA_TODEVICE) {
+       if (dir != PCI_DMA_TODEVICE) {
                for_each_sg(sgl, sg, nents, n) {
                        BUG_ON(page_address(sg_page(sg)) == NULL);
                        mmu_inval_dma_area(
@@ -641,15 +622,14 @@ void pci_dma_sync_sg_for_cpu(struct pci_dev *hwdev, struct scatterlist *sgl, int
                }
        }
 }
-EXPORT_SYMBOL(pci_dma_sync_sg_for_cpu);
 
-void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sgl, int nents, int direction)
+static void pci32_sync_sg_for_device(struct device *device, struct scatterlist *sgl,
+                                    int nents, enum dma_data_direction dir)
 {
        struct scatterlist *sg;
        int n;
 
-       BUG_ON(direction == PCI_DMA_NONE);
-       if (direction != PCI_DMA_TODEVICE) {
+       if (dir != PCI_DMA_TODEVICE) {
                for_each_sg(sgl, sg, nents, n) {
                        BUG_ON(page_address(sg_page(sg)) == NULL);
                        mmu_inval_dma_area(
@@ -658,9 +638,49 @@ void pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sgl,
                }
        }
 }
-EXPORT_SYMBOL(pci_dma_sync_sg_for_device);
+
+struct dma_map_ops pci32_dma_ops = {
+       .alloc_coherent         = pci32_alloc_coherent,
+       .free_coherent          = pci32_free_coherent,
+       .map_page               = pci32_map_page,
+       .map_sg                 = pci32_map_sg,
+       .unmap_sg               = pci32_unmap_sg,
+       .sync_single_for_cpu    = pci32_sync_single_for_cpu,
+       .sync_single_for_device = pci32_sync_single_for_device,
+       .sync_sg_for_cpu        = pci32_sync_sg_for_cpu,
+       .sync_sg_for_device     = pci32_sync_sg_for_device,
+};
+EXPORT_SYMBOL(pci32_dma_ops);
+
 #endif /* CONFIG_PCI */
 
+/*
+ * Return whether the given PCI device DMA address mask can be
+ * supported properly.  For example, if your device can only drive the
+ * low 24-bits during PCI bus mastering, then you would pass
+ * 0x00ffffff as the mask to this function.
+ */
+int dma_supported(struct device *dev, u64 mask)
+{
+#ifdef CONFIG_PCI
+       if (dev->bus == &pci_bus_type)
+               return 1;
+#endif
+       return 0;
+}
+EXPORT_SYMBOL(dma_supported);
+
+int dma_set_mask(struct device *dev, u64 dma_mask)
+{
+#ifdef CONFIG_PCI
+       if (dev->bus == &pci_bus_type)
+               return pci_set_dma_mask(to_pci_dev(dev), dma_mask);
+#endif
+       return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+
 #ifdef CONFIG_PROC_FS
 
 static int
index 57859ad..c686486 100644 (file)
@@ -1039,7 +1039,7 @@ static void ali_sound_dma_hack(struct pci_dev *pdev, int set_bit)
        pci_dev_put(ali_isa_bridge);
 }
 
-int pci_dma_supported(struct pci_dev *pdev, u64 device_mask)
+int pci64_dma_supported(struct pci_dev *pdev, u64 device_mask)
 {
        u64 dma_addr_mask;
 
index 2485eaa..23c33ff 100644 (file)
@@ -232,7 +232,8 @@ static void dma_4v_free_coherent(struct device *dev, size_t size, void *cpu,
 
 static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page,
                                  unsigned long offset, size_t sz,
-                                 enum dma_data_direction direction)
+                                 enum dma_data_direction direction,
+                                 struct dma_attrs *attrs)
 {
        struct iommu *iommu;
        unsigned long flags, npages, oaddr;
@@ -296,7 +297,8 @@ iommu_map_fail:
 }
 
 static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr,
-                             size_t sz, enum dma_data_direction direction)
+                             size_t sz, enum dma_data_direction direction,
+                             struct dma_attrs *attrs)
 {
        struct pci_pbm_info *pbm;
        struct iommu *iommu;
@@ -336,7 +338,8 @@ static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr,
 }
 
 static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist,
-                        int nelems, enum dma_data_direction direction)
+                        int nelems, enum dma_data_direction direction,
+                        struct dma_attrs *attrs)
 {
        struct scatterlist *s, *outs, *segstart;
        unsigned long flags, handle, prot;
@@ -478,7 +481,8 @@ iommu_map_failed:
 }
 
 static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist,
-                           int nelems, enum dma_data_direction direction)
+                           int nelems, enum dma_data_direction direction,
+                           struct dma_attrs *attrs)
 {
        struct pci_pbm_info *pbm;
        struct scatterlist *sg;
@@ -521,29 +525,13 @@ static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist,
        spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
-static void dma_4v_sync_single_for_cpu(struct device *dev,
-                                      dma_addr_t bus_addr, size_t sz,
-                                      enum dma_data_direction direction)
-{
-       /* Nothing to do... */
-}
-
-static void dma_4v_sync_sg_for_cpu(struct device *dev,
-                                  struct scatterlist *sglist, int nelems,
-                                  enum dma_data_direction direction)
-{
-       /* Nothing to do... */
-}
-
-static const struct dma_ops sun4v_dma_ops = {
+static struct dma_map_ops sun4v_dma_ops = {
        .alloc_coherent                 = dma_4v_alloc_coherent,
        .free_coherent                  = dma_4v_free_coherent,
        .map_page                       = dma_4v_map_page,
        .unmap_page                     = dma_4v_unmap_page,
        .map_sg                         = dma_4v_map_sg,
        .unmap_sg                       = dma_4v_unmap_sg,
-       .sync_single_for_cpu            = dma_4v_sync_single_for_cpu,
-       .sync_sg_for_cpu                = dma_4v_sync_sg_for_cpu,
 };
 
 static void __devinit pci_sun4v_scan_bus(struct pci_pbm_info *pbm,
index 4041f94..18d6785 100644 (file)
@@ -251,7 +251,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp)
        }
 }
 
-void __trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(void)
 {
        struct thread_info *tp = current_thread_info();
        struct pt_regs *regs = get_irq_regs();
@@ -304,7 +304,7 @@ void __trigger_all_cpu_backtrace(void)
 
 static void sysrq_handle_globreg(int key, struct tty_struct *tty)
 {
-       __trigger_all_cpu_backtrace();
+       arch_trigger_all_cpu_backtrace();
 }
 
 static struct sysrq_key_op sparc_globalreg_op = {
index 13ffa5d..fc20fdc 100644 (file)
@@ -38,7 +38,7 @@ config X86
        select HAVE_FUNCTION_GRAPH_FP_TEST
        select HAVE_FUNCTION_TRACE_MCOUNT_TEST
        select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
-       select HAVE_FTRACE_SYSCALLS
+       select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_KVM
        select HAVE_ARCH_KGDB
        select HAVE_ARCH_TRACEHOOK
@@ -586,7 +586,6 @@ config GART_IOMMU
        bool "GART IOMMU support" if EMBEDDED
        default y
        select SWIOTLB
-       select AGP
        depends on X86_64 && PCI
        ---help---
          Support for full DMA access of devices with 32bit memory access only
index edb992e..d28fad1 100644 (file)
@@ -2355,7 +2355,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_HAVE_HW_BRANCH_TRACER=y
-CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
 CONFIG_TRACING_SUPPORT=y
index cee1dd2..6c86acd 100644 (file)
@@ -2329,7 +2329,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_HAVE_HW_BRANCH_TRACER=y
-CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
 CONFIG_TRACING_SUPPORT=y
index bdf96f1..ac95995 100644 (file)
@@ -25,6 +25,7 @@
 #ifdef CONFIG_AMD_IOMMU
 extern int amd_iommu_init(void);
 extern int amd_iommu_init_dma_ops(void);
+extern int amd_iommu_init_passthrough(void);
 extern void amd_iommu_detect(void);
 extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
 extern void amd_iommu_flush_all_domains(void);
index 0c878ca..2a2cc7a 100644 (file)
 #define EVT_BUFFER_SIZE                8192 /* 512 entries */
 #define EVT_LEN_MASK           (0x9ULL << 56)
 
+#define PAGE_MODE_NONE    0x00
 #define PAGE_MODE_1_LEVEL 0x01
 #define PAGE_MODE_2_LEVEL 0x02
 #define PAGE_MODE_3_LEVEL 0x03
-
-#define IOMMU_PDE_NL_0   0x000ULL
-#define IOMMU_PDE_NL_1   0x200ULL
-#define IOMMU_PDE_NL_2   0x400ULL
-#define IOMMU_PDE_NL_3   0x600ULL
-
-#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL)
-#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL)
-#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL)
-
-#define IOMMU_MAP_SIZE_L1 (1ULL << 21)
-#define IOMMU_MAP_SIZE_L2 (1ULL << 30)
-#define IOMMU_MAP_SIZE_L3 (1ULL << 39)
+#define PAGE_MODE_4_LEVEL 0x04
+#define PAGE_MODE_5_LEVEL 0x05
+#define PAGE_MODE_6_LEVEL 0x06
+
+#define PM_LEVEL_SHIFT(x)      (12 + ((x) * 9))
+#define PM_LEVEL_SIZE(x)       (((x) < 6) ? \
+                                 ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
+                                  (0xffffffffffffffffULL))
+#define PM_LEVEL_INDEX(x, a)   (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
+#define PM_LEVEL_ENC(x)                (((x) << 9) & 0xe00ULL)
+#define PM_LEVEL_PDE(x, a)     ((a) | PM_LEVEL_ENC((x)) | \
+                                IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
+#define PM_PTE_LEVEL(pte)      (((pte) >> 9) & 0x7ULL)
+
+#define PM_MAP_4k              0
+#define PM_ADDR_MASK           0x000ffffffffff000ULL
+#define PM_MAP_MASK(lvl)       (PM_ADDR_MASK & \
+                               (~((1ULL << (12 + ((lvl) * 9))) - 1)))
+#define PM_ALIGNED(lvl, addr)  ((PM_MAP_MASK(lvl) & (addr)) == (addr))
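Taken together, these macros split an IO virtual address into 9-bit table indices above the 12-bit page offset; PM_LEVEL_SIZE(3) is 2^39 - 1, so a three-level table covers a 512 GB address space and each additional level adds another 9 bits. A small illustration of the decomposition for a mode-3 table (the address value is arbitrary):

        /* For a 3-level table (domain->mode == 3) the walk starts at level 2. */
        unsigned long addr = 0x7fdeadb000UL;
        unsigned long l2 = PM_LEVEL_INDEX(2, addr);     /* (addr >> 30) & 0x1ff */
        unsigned long l1 = PM_LEVEL_INDEX(1, addr);     /* (addr >> 21) & 0x1ff */
        unsigned long l0 = PM_LEVEL_INDEX(0, addr);     /* (addr >> 12) & 0x1ff */
        /* the low 12 bits (addr & 0xfff) remain the offset within the page   */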
 
 #define IOMMU_PTE_P  (1ULL << 0)
 #define IOMMU_PTE_TV (1ULL << 1)
 #define IOMMU_PTE_IR (1ULL << 61)
 #define IOMMU_PTE_IW (1ULL << 62)
 
-#define IOMMU_L1_PDE(address) \
-       ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
-#define IOMMU_L2_PDE(address) \
-       ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
-
 #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
 #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
 #define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
 #define PD_DMA_OPS_MASK                (1UL << 0) /* domain used for dma_ops */
 #define PD_DEFAULT_MASK                (1UL << 1) /* domain is a default dma_ops
                                              domain for an IOMMU */
+#define PD_PASSTHROUGH_MASK    (1UL << 2) /* domain has no page
+                                             translation */
+
 extern bool amd_iommu_dump;
 #define DUMP_printk(format, arg...)                                    \
        do {                                                            \
                if (amd_iommu_dump)                                             \
-                       printk(KERN_INFO "AMD IOMMU: " format, ## arg); \
+                       printk(KERN_INFO "AMD-Vi: " format, ## arg);    \
        } while(0);
 
 /*
@@ -226,6 +231,7 @@ struct protection_domain {
        int mode;               /* paging mode (0-6 levels) */
        u64 *pt_root;           /* page table root pointer */
        unsigned long flags;    /* flags to find out type of domain */
+       bool updated;           /* complete domain flush required */
        unsigned dev_cnt;       /* devices assigned to this domain */
        void *priv;             /* private data */
 };
@@ -337,6 +343,9 @@ struct amd_iommu {
        /* if one, we need to send a completion wait command */
        bool need_sync;
 
+       /* becomes true if a command buffer reset is running */
+       bool reset_in_progress;
+
        /* default dma_ops domain for that IOMMU */
        struct dma_ops_domain *default_dom;
 };
@@ -457,4 +466,7 @@ static inline void amd_iommu_stats_init(void) { }
 
 #endif /* CONFIG_AMD_IOMMU_STATS */
 
+/* some function prototypes */
+extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
+
 #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
index 1c3f943..0ee770d 100644 (file)
@@ -55,6 +55,24 @@ extern int dma_set_mask(struct device *dev, u64 mask);
 extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
                                        dma_addr_t *dma_addr, gfp_t flag);
 
+static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+{
+       if (!dev->dma_mask)
+               return 0;
+
+       return addr + size <= *dev->dma_mask;
+}
+
+static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+       return paddr;
+}
+
+static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
+{
+       return daddr;
+}
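A sketch of how a generic mapping path can combine these helpers to decide whether a bounce buffer is needed (illustrative only, not lifted from the swiotlb code; dev, paddr and size are assumed to be in scope):

        /* Can the device reach this physical buffer with its current mask? */
        dma_addr_t dev_addr = phys_to_dma(dev, paddr);

        if (!dma_capable(dev, dev_addr, size)) {
                /* out of reach for this device's DMA mask: bounce the buffer */
        }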
+
 static inline void
 dma_cache_sync(struct device *dev, void *vaddr, size_t size,
        enum dma_data_direction dir)
index bd2c651..db24c22 100644 (file)
 
 #endif
 
-/* FIXME: I don't want to stay hardcoded */
-#ifdef CONFIG_X86_64
-# define FTRACE_SYSCALL_MAX     296
-#else
-# define FTRACE_SYSCALL_MAX     333
-#endif
-
 #ifdef CONFIG_FUNCTION_TRACER
 #define MCOUNT_ADDR            ((long)(mcount))
 #define MCOUNT_INSN_SIZE       5 /* sizeof mcount call */
index c86e5ed..e63cf7d 100644 (file)
@@ -45,8 +45,8 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
                        void __user *, size_t *, loff_t *);
 extern int unknown_nmi_panic;
 
-void __trigger_all_cpu_backtrace(void);
-#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
+void arch_trigger_all_cpu_backtrace(void);
+#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
 
 static inline void localise_nmi_watchdog(void)
 {
index fa64e40..e7b7c93 100644 (file)
@@ -84,6 +84,16 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2                    0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES                   (X86_PMC_IDX_FIXED + 2)
 
+/*
+ * We model BTS tracing as another fixed-mode PMC.
+ *
+ * We choose a value in the middle of the fixed counter range, since lower
+ * values are used by actual fixed counters and higher values are used
+ * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
+ */
+#define X86_PMC_IDX_FIXED_BTS                          (X86_PMC_IDX_FIXED + 16)
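With X86_PMC_IDX_FIXED defined as 32 in this header, the BTS pseudo-counter therefore lands at index 48: above the architectural fixed counters at indices 32-34 and below the buffer-overflow/condition-changed status bits (62/63) that the comment alludes to.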
+
+
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
 extern void perf_counters_lapic_init(void);
index fad7d40..6f7786a 100644 (file)
@@ -95,7 +95,7 @@ struct thread_info {
 #define TIF_DEBUGCTLMSR                25      /* uses thread_struct.debugctlmsr */
 #define TIF_DS_AREA_MSR                26      /* uses thread_struct.ds_area_msr */
 #define TIF_LAZY_MMU_UPDATES   27      /* task is updating the mmu lazily */
-#define TIF_SYSCALL_FTRACE     28      /* for ftrace syscall instrumentation */
+#define TIF_SYSCALL_TRACEPOINT 28      /* syscall tracepoint instrumentation */
 
 #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
@@ -118,17 +118,17 @@ struct thread_info {
 #define _TIF_DEBUGCTLMSR       (1 << TIF_DEBUGCTLMSR)
 #define _TIF_DS_AREA_MSR       (1 << TIF_DS_AREA_MSR)
 #define _TIF_LAZY_MMU_UPDATES  (1 << TIF_LAZY_MMU_UPDATES)
-#define _TIF_SYSCALL_FTRACE    (1 << TIF_SYSCALL_FTRACE)
+#define _TIF_SYSCALL_TRACEPOINT        (1 << TIF_SYSCALL_TRACEPOINT)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY        \
-       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE |  \
-        _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
+       (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |   \
+        _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
 
 /* work to do in syscall_trace_leave() */
 #define _TIF_WORK_SYSCALL_EXIT \
        (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP |    \
-        _TIF_SYSCALL_FTRACE)
+        _TIF_SYSCALL_TRACEPOINT)
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK                                                 \
@@ -137,7 +137,8 @@ struct thread_info {
           _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
 
 /* work to do on any return to user space */
-#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE)
+#define _TIF_ALLWORK_MASK                                              \
+       ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT)
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK                                            \
index 066ef59..26d06e0 100644 (file)
@@ -129,25 +129,34 @@ extern unsigned long node_remap_size[];
 #endif
 
 /* sched_domains SD_NODE_INIT for NUMA machines */
-#define SD_NODE_INIT (struct sched_domain) {           \
-       .min_interval           = 8,                    \
-       .max_interval           = 32,                   \
-       .busy_factor            = 32,                   \
-       .imbalance_pct          = 125,                  \
-       .cache_nice_tries       = SD_CACHE_NICE_TRIES,  \
-       .busy_idx               = 3,                    \
-       .idle_idx               = SD_IDLE_IDX,          \
-       .newidle_idx            = SD_NEWIDLE_IDX,       \
-       .wake_idx               = 1,                    \
-       .forkexec_idx           = SD_FORKEXEC_IDX,      \
-       .flags                  = SD_LOAD_BALANCE       \
-                               | SD_BALANCE_EXEC       \
-                               | SD_BALANCE_FORK       \
-                               | SD_WAKE_AFFINE        \
-                               | SD_WAKE_BALANCE       \
-                               | SD_SERIALIZE,         \
-       .last_balance           = jiffies,              \
-       .balance_interval       = 1,                    \
+#define SD_NODE_INIT (struct sched_domain) {                           \
+       .min_interval           = 8,                                    \
+       .max_interval           = 32,                                   \
+       .busy_factor            = 32,                                   \
+       .imbalance_pct          = 125,                                  \
+       .cache_nice_tries       = SD_CACHE_NICE_TRIES,                  \
+       .busy_idx               = 3,                                    \
+       .idle_idx               = SD_IDLE_IDX,                          \
+       .newidle_idx            = SD_NEWIDLE_IDX,                       \
+       .wake_idx               = 1,                                    \
+       .forkexec_idx           = SD_FORKEXEC_IDX,                      \
+                                                                       \
+       .flags                  = 1*SD_LOAD_BALANCE                     \
+                               | 1*SD_BALANCE_NEWIDLE                  \
+                               | 1*SD_BALANCE_EXEC                     \
+                               | 1*SD_BALANCE_FORK                     \
+                               | 0*SD_WAKE_IDLE                        \
+                               | 1*SD_WAKE_AFFINE                      \
+                               | 1*SD_WAKE_BALANCE                     \
+                               | 0*SD_SHARE_CPUPOWER                   \
+                               | 0*SD_POWERSAVINGS_BALANCE             \
+                               | 0*SD_SHARE_PKG_RESOURCES              \
+                               | 1*SD_SERIALIZE                        \
+                               | 1*SD_WAKE_IDLE_FAR                    \
+                               | 0*SD_PREFER_SIBLING                   \
+                               ,                                       \
+       .last_balance           = jiffies,                              \
+       .balance_interval       = 1,                                    \
 }
 
 #ifdef CONFIG_X86_64_ACPI_NUMA
index 732a307..8deaada 100644 (file)
 
 #ifdef __KERNEL__
 
+#define NR_syscalls 337
+
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_OLD_STAT
index 900e161..b9f3c60 100644 (file)
@@ -688,6 +688,12 @@ __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
 #endif /* __NO_STUBS */
 
 #ifdef __KERNEL__
+
+#ifndef COMPILE_OFFSETS
+#include <asm/asm-offsets.h>
+#define NR_syscalls (__NR_syscall_max + 1)
+#endif
+
 /*
  * "Conditional" syscalls
  *
index 6c99f50..98f230f 100644 (file)
@@ -41,9 +41,13 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
 static LIST_HEAD(iommu_pd_list);
 static DEFINE_SPINLOCK(iommu_pd_list_lock);
 
-#ifdef CONFIG_IOMMU_API
+/*
+ * Domain for untranslated devices - only allocated
+ * if iommu=pt passed on kernel cmd line.
+ */
+static struct protection_domain *pt_domain;
+
 static struct iommu_ops amd_iommu_ops;
-#endif
 
 /*
  * general struct to manage commands send to an IOMMU
@@ -55,16 +59,16 @@ struct iommu_cmd {
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
                             struct unity_map_entry *e);
 static struct dma_ops_domain *find_protection_domain(u16 devid);
-static u64* alloc_pte(struct protection_domain *dom,
-                     unsigned long address, u64
-                     **pte_page, gfp_t gfp);
+static u64 *alloc_pte(struct protection_domain *domain,
+                     unsigned long address, int end_lvl,
+                     u64 **pte_page, gfp_t gfp);
 static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
                                      unsigned long start_page,
                                      unsigned int pages);
-
-#ifndef BUS_NOTIFY_UNBOUND_DRIVER
-#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
-#endif
+static void reset_iommu_command_buffer(struct amd_iommu *iommu);
+static u64 *fetch_pte(struct protection_domain *domain,
+                     unsigned long address, int map_size);
+static void update_domain(struct protection_domain *domain);
 
 #ifdef CONFIG_AMD_IOMMU_STATS
 
@@ -138,7 +142,25 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
  *
  ****************************************************************************/
 
-static void iommu_print_event(void *__evt)
+static void dump_dte_entry(u16 devid)
+{
+       int i;
+
+       for (i = 0; i < 8; ++i)
+               pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
+                       amd_iommu_dev_table[devid].data[i]);
+}
+
+static void dump_command(unsigned long phys_addr)
+{
+       struct iommu_cmd *cmd = phys_to_virt(phys_addr);
+       int i;
+
+       for (i = 0; i < 4; ++i)
+               pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
+}
+
+static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
 {
        u32 *event = __evt;
        int type  = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
@@ -147,7 +169,7 @@ static void iommu_print_event(void *__evt)
        int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
        u64 address = (u64)(((u64)event[3]) << 32) | event[2];
 
-       printk(KERN_ERR "AMD IOMMU: Event logged [");
+       printk(KERN_ERR "AMD-Vi: Event logged [");
 
        switch (type) {
        case EVENT_TYPE_ILL_DEV:
@@ -155,6 +177,7 @@ static void iommu_print_event(void *__evt)
                       "address=0x%016llx flags=0x%04x]\n",
                       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
                       address, flags);
+               dump_dte_entry(devid);
                break;
        case EVENT_TYPE_IO_FAULT:
                printk("IO_PAGE_FAULT device=%02x:%02x.%x "
@@ -176,6 +199,8 @@ static void iommu_print_event(void *__evt)
                break;
        case EVENT_TYPE_ILL_CMD:
                printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
+               reset_iommu_command_buffer(iommu);
+               dump_command(address);
                break;
        case EVENT_TYPE_CMD_HARD_ERR:
                printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
@@ -209,7 +234,7 @@ static void iommu_poll_events(struct amd_iommu *iommu)
        tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
 
        while (head != tail) {
-               iommu_print_event(iommu->evt_buf + head);
+               iommu_print_event(iommu, iommu->evt_buf + head);
                head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
        }
 
@@ -296,8 +321,11 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
        status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
        writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
 
-       if (unlikely(i == EXIT_LOOP_COUNT))
-               panic("AMD IOMMU: Completion wait loop failed\n");
+       if (unlikely(i == EXIT_LOOP_COUNT)) {
+               spin_unlock(&iommu->lock);
+               reset_iommu_command_buffer(iommu);
+               spin_lock(&iommu->lock);
+       }
 }
 
 /*
@@ -445,47 +473,78 @@ static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
 }
 
 /*
+ * This function flushes one domain on one IOMMU
+ */
+static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid)
+{
+       struct iommu_cmd cmd;
+       unsigned long flags;
+
+       __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+                                     domid, 1, 1);
+
+       spin_lock_irqsave(&iommu->lock, flags);
+       __iommu_queue_command(iommu, &cmd);
+       __iommu_completion_wait(iommu);
+       __iommu_wait_for_completion(iommu);
+       spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static void flush_all_domains_on_iommu(struct amd_iommu *iommu)
+{
+       int i;
+
+       for (i = 1; i < MAX_DOMAIN_ID; ++i) {
+               if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
+                       continue;
+               flush_domain_on_iommu(iommu, i);
+       }
+
+}
+
+/*
  * This function is used to flush the IO/TLB for a given protection domain
  * on every IOMMU in the system
  */
 static void iommu_flush_domain(u16 domid)
 {
-       unsigned long flags;
        struct amd_iommu *iommu;
-       struct iommu_cmd cmd;
 
        INC_STATS_COUNTER(domain_flush_all);
 
-       __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
-                                     domid, 1, 1);
-
-       for_each_iommu(iommu) {
-               spin_lock_irqsave(&iommu->lock, flags);
-               __iommu_queue_command(iommu, &cmd);
-               __iommu_completion_wait(iommu);
-               __iommu_wait_for_completion(iommu);
-               spin_unlock_irqrestore(&iommu->lock, flags);
-       }
+       for_each_iommu(iommu)
+               flush_domain_on_iommu(iommu, domid);
 }
 
 void amd_iommu_flush_all_domains(void)
 {
+       struct amd_iommu *iommu;
+
+       for_each_iommu(iommu)
+               flush_all_domains_on_iommu(iommu);
+}
+
+static void flush_all_devices_for_iommu(struct amd_iommu *iommu)
+{
        int i;
 
-       for (i = 1; i < MAX_DOMAIN_ID; ++i) {
-               if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
+       for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+               if (iommu != amd_iommu_rlookup_table[i])
                        continue;
-               iommu_flush_domain(i);
+
+               iommu_queue_inv_dev_entry(iommu, i);
+               iommu_completion_wait(iommu);
        }
 }
 
-void amd_iommu_flush_all_devices(void)
+static void flush_devices_by_domain(struct protection_domain *domain)
 {
        struct amd_iommu *iommu;
        int i;
 
        for (i = 0; i <= amd_iommu_last_bdf; ++i) {
-               if (amd_iommu_pd_table[i] == NULL)
+               if ((domain == NULL && amd_iommu_pd_table[i] == NULL) ||
+                   (amd_iommu_pd_table[i] != domain))
                        continue;
 
                iommu = amd_iommu_rlookup_table[i];
@@ -497,6 +556,27 @@ void amd_iommu_flush_all_devices(void)
        }
 }
 
+static void reset_iommu_command_buffer(struct amd_iommu *iommu)
+{
+       pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
+
+       if (iommu->reset_in_progress)
+               panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
+
+       iommu->reset_in_progress = true;
+
+       amd_iommu_reset_cmd_buffer(iommu);
+       flush_all_devices_for_iommu(iommu);
+       flush_all_domains_on_iommu(iommu);
+
+       iommu->reset_in_progress = false;
+}
+
+void amd_iommu_flush_all_devices(void)
+{
+       flush_devices_by_domain(NULL);
+}
+
 /****************************************************************************
  *
  * The functions below are used to create the page table mappings for
@@ -514,18 +594,21 @@ void amd_iommu_flush_all_devices(void)
 static int iommu_map_page(struct protection_domain *dom,
                          unsigned long bus_addr,
                          unsigned long phys_addr,
-                         int prot)
+                         int prot,
+                         int map_size)
 {
        u64 __pte, *pte;
 
        bus_addr  = PAGE_ALIGN(bus_addr);
        phys_addr = PAGE_ALIGN(phys_addr);
 
-       /* only support 512GB address spaces for now */
-       if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
+       BUG_ON(!PM_ALIGNED(map_size, bus_addr));
+       BUG_ON(!PM_ALIGNED(map_size, phys_addr));
+
+       if (!(prot & IOMMU_PROT_MASK))
                return -EINVAL;
 
-       pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
+       pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
 
        if (IOMMU_PTE_PRESENT(*pte))
                return -EBUSY;
@@ -538,29 +621,18 @@ static int iommu_map_page(struct protection_domain *dom,
 
        *pte = __pte;
 
+       update_domain(dom);
+
        return 0;
 }
 
 static void iommu_unmap_page(struct protection_domain *dom,
-                            unsigned long bus_addr)
+                            unsigned long bus_addr, int map_size)
 {
-       u64 *pte;
-
-       pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
-
-       if (!IOMMU_PTE_PRESENT(*pte))
-               return;
-
-       pte = IOMMU_PTE_PAGE(*pte);
-       pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+       u64 *pte = fetch_pte(dom, bus_addr, map_size);
 
-       if (!IOMMU_PTE_PRESENT(*pte))
-               return;
-
-       pte = IOMMU_PTE_PAGE(*pte);
-       pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
-
-       *pte = 0;
+       if (pte)
+               *pte = 0;
 }
 
 /*
@@ -615,7 +687,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 
        for (addr = e->address_start; addr < e->address_end;
             addr += PAGE_SIZE) {
-               ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot);
+               ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
+                                    PM_MAP_4k);
                if (ret)
                        return ret;
                /*
@@ -670,24 +743,29 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
  * This function checks if there is a PTE for a given dma address. If
  * there is one, it returns the pointer to it.
  */
-static u64* fetch_pte(struct protection_domain *domain,
-                     unsigned long address)
+static u64 *fetch_pte(struct protection_domain *domain,
+                     unsigned long address, int map_size)
 {
+       int level;
        u64 *pte;
 
-       pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
+       level =  domain->mode - 1;
+       pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
 
-       if (!IOMMU_PTE_PRESENT(*pte))
-               return NULL;
+       while (level > map_size) {
+               if (!IOMMU_PTE_PRESENT(*pte))
+                       return NULL;
 
-       pte = IOMMU_PTE_PAGE(*pte);
-       pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+               level -= 1;
 
-       if (!IOMMU_PTE_PRESENT(*pte))
-               return NULL;
+               pte = IOMMU_PTE_PAGE(*pte);
+               pte = &pte[PM_LEVEL_INDEX(level, address)];
 
-       pte = IOMMU_PTE_PAGE(*pte);
-       pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+               if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
+                       pte = NULL;
+                       break;
+               }
+       }
 
        return pte;
 }
@@ -727,7 +805,7 @@ static int alloc_new_range(struct amd_iommu *iommu,
                u64 *pte, *pte_page;
 
                for (i = 0; i < num_ptes; ++i) {
-                       pte = alloc_pte(&dma_dom->domain, address,
+                       pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
                                        &pte_page, gfp);
                        if (!pte)
                                goto out_free;
@@ -760,16 +838,20 @@ static int alloc_new_range(struct amd_iommu *iommu,
        for (i = dma_dom->aperture[index]->offset;
             i < dma_dom->aperture_size;
             i += PAGE_SIZE) {
-               u64 *pte = fetch_pte(&dma_dom->domain, i);
+               u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
                if (!pte || !IOMMU_PTE_PRESENT(*pte))
                        continue;
 
                dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
        }
 
+       update_domain(&dma_dom->domain);
+
        return 0;
 
 out_free:
+       update_domain(&dma_dom->domain);
+
        free_page((unsigned long)dma_dom->aperture[index]->bitmap);
 
        kfree(dma_dom->aperture[index]);
@@ -1009,7 +1091,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
        dma_dom->domain.id = domain_id_alloc();
        if (dma_dom->domain.id == 0)
                goto free_dma_dom;
-       dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
+       dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
        dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
        dma_dom->domain.flags = PD_DMA_OPS_MASK;
        dma_dom->domain.priv = dma_dom;
@@ -1063,6 +1145,41 @@ static struct protection_domain *domain_for_device(u16 devid)
        return dom;
 }
 
+static void set_dte_entry(u16 devid, struct protection_domain *domain)
+{
+       u64 pte_root = virt_to_phys(domain->pt_root);
+
+       pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
+                   << DEV_ENTRY_MODE_SHIFT;
+       pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
+
+       amd_iommu_dev_table[devid].data[2] = domain->id;
+       amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
+       amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
+
+       amd_iommu_pd_table[devid] = domain;
+}
+
+/*
+ * If a device is not yet associated with a domain, this function attaches
+ * it to that domain and makes it visible to the hardware
+ */
+static void __attach_device(struct amd_iommu *iommu,
+                           struct protection_domain *domain,
+                           u16 devid)
+{
+       /* lock domain */
+       spin_lock(&domain->lock);
+
+       /* update DTE entry */
+       set_dte_entry(devid, domain);
+
+       domain->dev_cnt += 1;
+
+       /* ready */
+       spin_unlock(&domain->lock);
+}
+
 /*
  * If a device is not yet associated with a domain, this function attaches
  * it to that domain and makes it visible to the hardware
@@ -1072,27 +1189,16 @@ static void attach_device(struct amd_iommu *iommu,
                          u16 devid)
 {
        unsigned long flags;
-       u64 pte_root = virt_to_phys(domain->pt_root);
-
-       domain->dev_cnt += 1;
-
-       pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
-                   << DEV_ENTRY_MODE_SHIFT;
-       pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
 
        write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-       amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
-       amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
-       amd_iommu_dev_table[devid].data[2] = domain->id;
-
-       amd_iommu_pd_table[devid] = domain;
+       __attach_device(iommu, domain, devid);
        write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
 
-       /*
-        * We might boot into a crash-kernel here. The crashed kernel
-        * left the caches in the IOMMU dirty. So we have to flush
-        * here to evict all dirty stuff.
-        */
+       /*
+        * We might boot into a crash-kernel here. The crashed kernel
+        * left the caches in the IOMMU dirty. So we have to flush
+        * here to evict all dirty stuff.
+        */
        iommu_queue_inv_dev_entry(iommu, devid);
        iommu_flush_tlb_pde(iommu, domain->id);
 }
@@ -1119,6 +1225,15 @@ static void __detach_device(struct protection_domain *domain, u16 devid)
 
        /* ready */
        spin_unlock(&domain->lock);
+
+       /*
+        * If we run in passthrough mode the device must be assigned to the
+        * passthrough domain if it is detached from any other domain
+        */
+       if (iommu_pass_through) {
+               struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+               __attach_device(iommu, pt_domain, devid);
+       }
 }
 
 /*
@@ -1164,6 +1279,8 @@ static int device_change_notifier(struct notifier_block *nb,
        case BUS_NOTIFY_UNBOUND_DRIVER:
                if (!domain)
                        goto out;
+               if (iommu_pass_through)
+                       break;
                detach_device(domain, devid);
                break;
        case BUS_NOTIFY_ADD_DEVICE:
@@ -1292,39 +1409,91 @@ static int get_device_resources(struct device *dev,
        return 1;
 }
 
+static void update_device_table(struct protection_domain *domain)
+{
+       unsigned long flags;
+       int i;
+
+       for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+               if (amd_iommu_pd_table[i] != domain)
+                       continue;
+               write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+               set_dte_entry(i, domain);
+               write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+       }
+}
+
+static void update_domain(struct protection_domain *domain)
+{
+       if (!domain->updated)
+               return;
+
+       update_device_table(domain);
+       flush_devices_by_domain(domain);
+       iommu_flush_domain(domain->id);
+
+       domain->updated = false;
+}
+
 /*
- * If the pte_page is not yet allocated this function is called
+ * This function adds another level to an IO page table. Each additional level
+ * increases the size of the address space by 9 bits, up to a maximum of
+ * 64 bits.
  */
-static u64* alloc_pte(struct protection_domain *dom,
-                     unsigned long address, u64 **pte_page, gfp_t gfp)
+static bool increase_address_space(struct protection_domain *domain,
+                                  gfp_t gfp)
+{
+       u64 *pte;
+
+       if (domain->mode == PAGE_MODE_6_LEVEL)
+               /* address space already 64 bit large */
+               return false;
+
+       pte = (void *)get_zeroed_page(gfp);
+       if (!pte)
+               return false;
+
+       *pte             = PM_LEVEL_PDE(domain->mode,
+                                       virt_to_phys(domain->pt_root));
+       domain->pt_root  = pte;
+       domain->mode    += 1;
+       domain->updated  = true;
+
+       return true;
+}
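In terms of PM_LEVEL_SIZE() this means a 3-level table reaches 2^39 - 1 (512 GB), a 4-level table 2^48 - 1, a 5-level table 2^57 - 1, and PAGE_MODE_6_LEVEL saturates to the full 64-bit range, which is why alloc_pte() below simply keeps calling this helper until the requested address fits.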
+
+static u64 *alloc_pte(struct protection_domain *domain,
+                     unsigned long address,
+                     int end_lvl,
+                     u64 **pte_page,
+                     gfp_t gfp)
 {
        u64 *pte, *page;
+       int level;
 
-       pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
+       while (address > PM_LEVEL_SIZE(domain->mode))
+               increase_address_space(domain, gfp);
 
-       if (!IOMMU_PTE_PRESENT(*pte)) {
-               page = (u64 *)get_zeroed_page(gfp);
-               if (!page)
-                       return NULL;
-               *pte = IOMMU_L2_PDE(virt_to_phys(page));
-       }
+       level =  domain->mode - 1;
+       pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
 
-       pte = IOMMU_PTE_PAGE(*pte);
-       pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+       while (level > end_lvl) {
+               if (!IOMMU_PTE_PRESENT(*pte)) {
+                       page = (u64 *)get_zeroed_page(gfp);
+                       if (!page)
+                               return NULL;
+                       *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
+               }
 
-       if (!IOMMU_PTE_PRESENT(*pte)) {
-               page = (u64 *)get_zeroed_page(gfp);
-               if (!page)
-                       return NULL;
-               *pte = IOMMU_L1_PDE(virt_to_phys(page));
-       }
+               level -= 1;
 
-       pte = IOMMU_PTE_PAGE(*pte);
+               pte = IOMMU_PTE_PAGE(*pte);
 
-       if (pte_page)
-               *pte_page = pte;
+               if (pte_page && level == end_lvl)
+                       *pte_page = pte;
 
-       pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+               pte = &pte[PM_LEVEL_INDEX(level, address)];
+       }
 
        return pte;
 }
@@ -1344,10 +1513,13 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
 
        pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
        if (!pte) {
-               pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
+               pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
+                               GFP_ATOMIC);
                aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
        } else
-               pte += IOMMU_PTE_L0_INDEX(address);
+               pte += PM_LEVEL_INDEX(0, address);
+
+       update_domain(&dom->domain);
 
        return pte;
 }
@@ -1409,7 +1581,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
        if (!pte)
                return;
 
-       pte += IOMMU_PTE_L0_INDEX(address);
+       pte += PM_LEVEL_INDEX(0, address);
 
        WARN_ON(!*pte);
 
@@ -1988,19 +2160,47 @@ static void cleanup_domain(struct protection_domain *domain)
        write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
 }
 
-static int amd_iommu_domain_init(struct iommu_domain *dom)
+static void protection_domain_free(struct protection_domain *domain)
+{
+       if (!domain)
+               return;
+
+       if (domain->id)
+               domain_id_free(domain->id);
+
+       kfree(domain);
+}
+
+static struct protection_domain *protection_domain_alloc(void)
 {
        struct protection_domain *domain;
 
        domain = kzalloc(sizeof(*domain), GFP_KERNEL);
        if (!domain)
-               return -ENOMEM;
+               return NULL;
 
        spin_lock_init(&domain->lock);
-       domain->mode = PAGE_MODE_3_LEVEL;
        domain->id = domain_id_alloc();
        if (!domain->id)
+               goto out_err;
+
+       return domain;
+
+out_err:
+       kfree(domain);
+
+       return NULL;
+}
+
+static int amd_iommu_domain_init(struct iommu_domain *dom)
+{
+       struct protection_domain *domain;
+
+       domain = protection_domain_alloc();
+       if (!domain)
                goto out_free;
+
+       domain->mode    = PAGE_MODE_3_LEVEL;
        domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
        if (!domain->pt_root)
                goto out_free;
@@ -2010,7 +2210,7 @@ static int amd_iommu_domain_init(struct iommu_domain *dom)
        return 0;
 
 out_free:
-       kfree(domain);
+       protection_domain_free(domain);
 
        return -ENOMEM;
 }
@@ -2115,7 +2315,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
        paddr &= PAGE_MASK;
 
        for (i = 0; i < npages; ++i) {
-               ret = iommu_map_page(domain, iova, paddr, prot);
+               ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
                if (ret)
                        return ret;
 
@@ -2136,7 +2336,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
        iova  &= PAGE_MASK;
 
        for (i = 0; i < npages; ++i) {
-               iommu_unmap_page(domain, iova);
+               iommu_unmap_page(domain, iova, PM_MAP_4k);
                iova  += PAGE_SIZE;
        }
 
@@ -2151,21 +2351,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
        phys_addr_t paddr;
        u64 *pte;
 
-       pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)];
-
-       if (!IOMMU_PTE_PRESENT(*pte))
-               return 0;
-
-       pte = IOMMU_PTE_PAGE(*pte);
-       pte = &pte[IOMMU_PTE_L1_INDEX(iova)];
-
-       if (!IOMMU_PTE_PRESENT(*pte))
-               return 0;
-
-       pte = IOMMU_PTE_PAGE(*pte);
-       pte = &pte[IOMMU_PTE_L0_INDEX(iova)];
+       pte = fetch_pte(domain, iova, PM_MAP_4k);
 
-       if (!IOMMU_PTE_PRESENT(*pte))
+       if (!pte || !IOMMU_PTE_PRESENT(*pte))
                return 0;
 
        paddr  = *pte & IOMMU_PAGE_MASK;
@@ -2191,3 +2379,46 @@ static struct iommu_ops amd_iommu_ops = {
        .domain_has_cap = amd_iommu_domain_has_cap,
 };
 
+/*****************************************************************************
+ *
+ * The next functions do a basic initialization of the IOMMU for passthrough
+ * mode
+ *
+ * In passthrough mode the IOMMU is initialized and enabled but not used for
+ * DMA-API translation.
+ *
+ *****************************************************************************/
+
+int __init amd_iommu_init_passthrough(void)
+{
+       struct pci_dev *dev = NULL;
+       u16 devid, devid2;
+
+       /* allocate passthrough domain */
+       pt_domain = protection_domain_alloc();
+       if (!pt_domain)
+               return -ENOMEM;
+
+       pt_domain->mode |= PAGE_MODE_NONE;
+
+       while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+               struct amd_iommu *iommu;
+
+               devid = calc_devid(dev->bus->number, dev->devfn);
+               if (devid > amd_iommu_last_bdf)
+                       continue;
+
+               devid2 = amd_iommu_alias_table[devid];
+
+               iommu = amd_iommu_rlookup_table[devid2];
+               if (!iommu)
+                       continue;
+
+               __attach_device(iommu, pt_domain, devid);
+               __attach_device(iommu, pt_domain, devid2);
+       }
+
+       pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
+
+       return 0;
+}
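This path is selected when the kernel is booted with iommu=pt (see the iommu_pass_through check in amd_iommu_init() further below): every PCI device behind an IOMMU, together with its alias, is attached to the single pt_domain, whose PAGE_MODE_NONE mode leaves the IOMMU enabled but DMA untranslated. As a usage note, the relevant boot parameter is simply:

        iommu=pt        # initialize and enable the IOMMU, but do not translate DMA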
index c1b17e9..b4b61d4 100644 (file)
@@ -252,7 +252,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
 /* Function to enable the hardware */
 static void iommu_enable(struct amd_iommu *iommu)
 {
-       printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
+       printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n",
               dev_name(&iommu->dev->dev), iommu->cap_ptr);
 
        iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
@@ -435,6 +435,20 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 }
 
 /*
+ * This function resets the command buffer if the IOMMU stopped fetching
+ * commands from it.
+ */
+void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
+{
+       iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+
+       writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+       writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+       iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+}
+
+/*
  * This function writes the command buffer address to the hardware and
  * enables it.
  */
@@ -450,11 +464,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
        memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
                    &entry, sizeof(entry));
 
-       /* set head and tail to zero manually */
-       writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
-       writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
-       iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+       amd_iommu_reset_cmd_buffer(iommu);
 }
 
 static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -858,7 +868,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
                switch (*p) {
                case ACPI_IVHD_TYPE:
 
-                       DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
+                       DUMP_printk("device: %02x:%02x.%01x cap: %04x "
                                    "seg: %d flags: %01x info %04x\n",
                                    PCI_BUS(h->devid), PCI_SLOT(h->devid),
                                    PCI_FUNC(h->devid), h->cap_ptr,
@@ -902,7 +912,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
 
        r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
                        IRQF_SAMPLE_RANDOM,
-                       "AMD IOMMU",
+                       "AMD-Vi",
                        NULL);
 
        if (r) {
@@ -1150,7 +1160,7 @@ int __init amd_iommu_init(void)
 
 
        if (no_iommu) {
-               printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n");
+               printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
                return 0;
        }
 
@@ -1242,22 +1252,28 @@ int __init amd_iommu_init(void)
        if (ret)
                goto free;
 
-       ret = amd_iommu_init_dma_ops();
+       if (iommu_pass_through)
+               ret = amd_iommu_init_passthrough();
+       else
+               ret = amd_iommu_init_dma_ops();
        if (ret)
                goto free;
 
        enable_iommus();
 
-       printk(KERN_INFO "AMD IOMMU: device isolation ");
+       if (iommu_pass_through)
+               goto out;
+
+       printk(KERN_INFO "AMD-Vi: device isolation ");
        if (amd_iommu_isolate)
                printk("enabled\n");
        else
                printk("disabled\n");
 
        if (amd_iommu_unmap_flush)
-               printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
+               printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
        else
-               printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
+               printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
 
 out:
        return ret;
index b3025b4..db72202 100644 (file)
@@ -39,7 +39,7 @@
 int unknown_nmi_panic;
 int nmi_watchdog_enabled;
 
-static cpumask_var_t backtrace_mask;
+static cpumask_t backtrace_mask __read_mostly;
 
 /* nmi_active:
  * >0: the lapic NMI watchdog is active, but can be disabled
@@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void)
        if (!prev_nmi_count)
                goto error;
 
-       alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO);
        printk(KERN_INFO "Testing NMI watchdog ... ");
 
 #ifdef CONFIG_SMP
@@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
        }
 
        /* We can be called before check_nmi_watchdog, hence NULL check. */
-       if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) {
+       if (cpumask_test_cpu(cpu, &backtrace_mask)) {
                static DEFINE_SPINLOCK(lock);   /* Serialise the printks */
 
                spin_lock(&lock);
                printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+               show_regs(regs);
                dump_stack();
                spin_unlock(&lock);
-               cpumask_clear_cpu(cpu, backtrace_mask);
+               cpumask_clear_cpu(cpu, &backtrace_mask);
+
+               rc = 1;
        }
 
        /* Could check oops_in_progress here too, but it's safer not to */
@@ -552,14 +554,18 @@ int do_nmi_callback(struct pt_regs *regs, int cpu)
        return 0;
 }
 
-void __trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(void)
 {
        int i;
 
-       cpumask_copy(backtrace_mask, cpu_online_mask);
+       cpumask_copy(&backtrace_mask, cpu_online_mask);
+
+       printk(KERN_INFO "sending NMI to all CPUs:\n");
+       apic->send_IPI_all(NMI_VECTOR);
+
        /* Wait for up to 10 seconds for all CPUs to do the backtrace */
        for (i = 0; i < 10 * 1000; i++) {
-               if (cpumask_empty(backtrace_mask))
+               if (cpumask_empty(&backtrace_mask))
                        break;
                mdelay(1);
        }
index 898ecc4..4a6aeed 100644 (file)
@@ -3,6 +3,7 @@
  * This code generates raw asm output which is post-processed to extract
  * and format the required data.
  */
+#define COMPILE_OFFSETS
 
 #include <linux/crypto.h>
 #include <linux/sched.h> 
index 900332b..f9cd084 100644 (file)
@@ -6,6 +6,7 @@
  *  Copyright (C) 2009 Jaswinder Singh Rajput
  *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
  *
  *  For licencing details see kernel-base/COPYING
  */
@@ -20,6 +21,7 @@
 #include <linux/sched.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
+#include <linux/cpu.h>
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
 
 static u64 perf_counter_mask __read_mostly;
 
+/* The maximal number of PEBS counters: */
+#define MAX_PEBS_COUNTERS      4
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE                24
+
+/* The size of a per-cpu BTS buffer in bytes: */
+#define BTS_BUFFER_SIZE                (BTS_RECORD_SIZE * 1024)
+
+/* The BTS overflow threshold in bytes from the end of the buffer: */
+#define BTS_OVFL_TH            (BTS_RECORD_SIZE * 64)
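With 24-byte records this works out to 24 KiB of BTS buffer per CPU, and the interrupt threshold sits 64 records (1536 bytes) before the end of the buffer.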
+
+
+/*
+ * Bits in the debugctlmsr controlling branch tracing.
+ */
+#define X86_DEBUGCTL_TR                        (1 << 6)
+#define X86_DEBUGCTL_BTS               (1 << 7)
+#define X86_DEBUGCTL_BTINT             (1 << 8)
+#define X86_DEBUGCTL_BTS_OFF_OS                (1 << 9)
+#define X86_DEBUGCTL_BTS_OFF_USR       (1 << 10)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+       u64     bts_buffer_base;
+       u64     bts_index;
+       u64     bts_absolute_maximum;
+       u64     bts_interrupt_threshold;
+       u64     pebs_buffer_base;
+       u64     pebs_index;
+       u64     pebs_absolute_maximum;
+       u64     pebs_interrupt_threshold;
+       u64     pebs_counter_reset[MAX_PEBS_COUNTERS];
+};
+
 struct cpu_hw_counters {
        struct perf_counter     *counters[X86_PMC_IDX_MAX];
        unsigned long           used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        unsigned long           interrupts;
        int                     enabled;
+       struct debug_store      *ds;
 };
 
 /*
@@ -58,6 +100,8 @@ struct x86_pmu {
        int             apic;
        u64             max_period;
        u64             intel_ctrl;
+       void            (*enable_bts)(u64 config);
+       void            (*disable_bts)(void);
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -577,6 +621,9 @@ x86_perf_counter_update(struct perf_counter *counter,
        u64 prev_raw_count, new_raw_count;
        s64 delta;
 
+       if (idx == X86_PMC_IDX_FIXED_BTS)
+               return 0;
+
        /*
         * Careful: an NMI might modify the previous counter value.
         *
@@ -666,10 +713,110 @@ static void release_pmc_hardware(void)
 #endif
 }
 
+static inline bool bts_available(void)
+{
+       return x86_pmu.enable_bts != NULL;
+}
+
+static inline void init_debug_store_on_cpu(int cpu)
+{
+       struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+
+       if (!ds)
+               return;
+
+       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+                    (u32)((u64)(unsigned long)ds),
+                    (u32)((u64)(unsigned long)ds >> 32));
+}
+
+static inline void fini_debug_store_on_cpu(int cpu)
+{
+       if (!per_cpu(cpu_hw_counters, cpu).ds)
+               return;
+
+       wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static void release_bts_hardware(void)
+{
+       int cpu;
+
+       if (!bts_available())
+               return;
+
+       get_online_cpus();
+
+       for_each_online_cpu(cpu)
+               fini_debug_store_on_cpu(cpu);
+
+       for_each_possible_cpu(cpu) {
+               struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+
+               if (!ds)
+                       continue;
+
+               per_cpu(cpu_hw_counters, cpu).ds = NULL;
+
+               kfree((void *)(unsigned long)ds->bts_buffer_base);
+               kfree(ds);
+       }
+
+       put_online_cpus();
+}
+
+static int reserve_bts_hardware(void)
+{
+       int cpu, err = 0;
+
+       if (!bts_available())
+               return 0;
+
+       get_online_cpus();
+
+       for_each_possible_cpu(cpu) {
+               struct debug_store *ds;
+               void *buffer;
+
+               err = -ENOMEM;
+               buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+               if (unlikely(!buffer))
+                       break;
+
+               ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+               if (unlikely(!ds)) {
+                       kfree(buffer);
+                       break;
+               }
+
+               ds->bts_buffer_base = (u64)(unsigned long)buffer;
+               ds->bts_index = ds->bts_buffer_base;
+               ds->bts_absolute_maximum =
+                       ds->bts_buffer_base + BTS_BUFFER_SIZE;
+               ds->bts_interrupt_threshold =
+                       ds->bts_absolute_maximum - BTS_OVFL_TH;
+
+               per_cpu(cpu_hw_counters, cpu).ds = ds;
+               err = 0;
+       }
+
+       if (err)
+               release_bts_hardware();
+       else {
+               for_each_online_cpu(cpu)
+                       init_debug_store_on_cpu(cpu);
+       }
+
+       put_online_cpus();
+
+       return err;
+}
+
 static void hw_perf_counter_destroy(struct perf_counter *counter)
 {
        if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
                release_pmc_hardware();
+               release_bts_hardware();
                mutex_unlock(&pmc_reserve_mutex);
        }
 }
@@ -712,6 +859,42 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
        return 0;
 }
 
+static void intel_pmu_enable_bts(u64 config)
+{
+       unsigned long debugctlmsr;
+
+       debugctlmsr = get_debugctlmsr();
+
+       debugctlmsr |= X86_DEBUGCTL_TR;
+       debugctlmsr |= X86_DEBUGCTL_BTS;
+       debugctlmsr |= X86_DEBUGCTL_BTINT;
+
+       if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+               debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+
+       if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+               debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+
+       update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_bts(void)
+{
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       unsigned long debugctlmsr;
+
+       if (!cpuc->ds)
+               return;
+
+       debugctlmsr = get_debugctlmsr();
+
+       debugctlmsr &=
+               ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
+                 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+
+       update_debugctlmsr(debugctlmsr);
+}
+
 /*
  * Setup the hardware configuration for a given attr_type
  */
@@ -728,9 +911,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
        err = 0;
        if (!atomic_inc_not_zero(&active_counters)) {
                mutex_lock(&pmc_reserve_mutex);
-               if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
-                       err = -EBUSY;
-               else
+               if (atomic_read(&active_counters) == 0) {
+                       if (!reserve_pmc_hardware())
+                               err = -EBUSY;
+                       else
+                               err = reserve_bts_hardware();
+               }
+               if (!err)
                        atomic_inc(&active_counters);
                mutex_unlock(&pmc_reserve_mutex);
        }
@@ -793,6 +980,20 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
        if (config == -1LL)
                return -EINVAL;
 
+       /*
+        * Branch tracing:
+        */
+       if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&