Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
Linus Torvalds [Thu, 6 Jan 2011 18:23:33 +0000 (10:23 -0800)]
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (30 commits)
  sched: Change wait_for_completion_*_timeout() to return a signed long
  sched, autogroup: Fix reference leak
  sched, autogroup: Fix potential access to freed memory
  sched: Remove redundant CONFIG_CGROUP_SCHED ifdef
  sched: Fix interactivity bug by charging unaccounted run-time on entity re-weight
  sched: Move periodic share updates to entity_tick()
  printk: Use this_cpu_{read|write} api on printk_pending
  sched: Make pushable_tasks CONFIG_SMP dependant
  sched: Add 'autogroup' scheduling feature: automated per session task groups
  sched: Fix unregister_fair_sched_group()
  sched: Remove unused argument dest_cpu to migrate_task()
  mutexes, sched: Introduce arch_mutex_cpu_relax()
  sched: Add some clock info to sched_debug
  cpu: Remove incorrect BUG_ON
  cpu: Remove unused variable
  sched: Fix UP build breakage
  sched: Make task dump print all 15 chars of proc comm
  sched: Update tg->shares after cpu.shares write
  sched: Allow update_cfs_load() to update global load
  sched: Implement demand based update_cfs_load()
  ...

238 files changed:
CREDITS
Documentation/RCU/trace.txt
Documentation/dontdiff
Documentation/kernel-docs.txt
Documentation/kernel-parameters.txt
Documentation/trace/events-power.txt [new file with mode: 0644]
MAINTAINERS
arch/alpha/include/asm/perf_event.h
arch/alpha/kernel/irq_alpha.c
arch/alpha/kernel/perf_event.c
arch/arm/kernel/perf_event.c
arch/mips/kernel/perf_event_mipsxx.c
arch/powerpc/kernel/e500-pmu.c
arch/powerpc/kernel/mpc7450-pmu.c
arch/powerpc/kernel/perf_event.c
arch/powerpc/kernel/perf_event_fsl_emb.c
arch/powerpc/kernel/power4-pmu.c
arch/powerpc/kernel/power5+-pmu.c
arch/powerpc/kernel/power5-pmu.c
arch/powerpc/kernel/power6-pmu.c
arch/powerpc/kernel/power7-pmu.c
arch/powerpc/kernel/ppc970-pmu.c
arch/sh/kernel/cpu/sh4/perf_event.c
arch/sh/kernel/cpu/sh4a/perf_event.c
arch/sh/kernel/perf_event.c
arch/sparc/include/asm/perf_event.h
arch/sparc/kernel/nmi.c
arch/sparc/kernel/perf_event.c
arch/x86/include/asm/alternative.h
arch/x86/include/asm/irq.h
arch/x86/include/asm/kdebug.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/nmi.h
arch/x86/include/asm/perf_event.h
arch/x86/include/asm/perf_event_p4.h
arch/x86/include/asm/smpboot_hooks.h
arch/x86/include/asm/stacktrace.h
arch/x86/include/asm/timer.h
arch/x86/kernel/alternative.c
arch/x86/kernel/apic/Makefile
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/hw_nmi.c
arch/x86/kernel/apic/io_apic.c
arch/x86/kernel/apic/nmi.c [deleted file]
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/perf_event.c
arch/x86/kernel/cpu/perf_event_amd.c
arch/x86/kernel/cpu/perf_event_intel.c
arch/x86/kernel/cpu/perfctr-watchdog.c
arch/x86/kernel/dumpstack.c
arch/x86/kernel/dumpstack_32.c
arch/x86/kernel/dumpstack_64.c
arch/x86/kernel/kprobes.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/stacktrace.c
arch/x86/kernel/time.c
arch/x86/kernel/traps.c
arch/x86/mm/kmemcheck/error.c
arch/x86/oprofile/backtrace.c
arch/x86/oprofile/nmi_int.c
arch/x86/oprofile/nmi_timer_int.c
arch/x86/oprofile/op_model_amd.c
arch/x86/oprofile/op_model_p4.c
drivers/acpi/acpica/nsinit.c
drivers/cpufreq/cpufreq.c
drivers/cpuidle/cpuidle.c
drivers/idle/intel_idle.c
drivers/watchdog/hpwdt.c
fs/gfs2/bmap.c
fs/gfs2/glock.c
fs/gfs2/glock.h
fs/gfs2/glops.c
fs/gfs2/incore.h
fs/gfs2/inode.c
fs/gfs2/lock_dlm.c
fs/gfs2/ops_inode.c
fs/gfs2/quota.c
fs/gfs2/rgrp.c
fs/gfs2/rgrp.h
fs/gfs2/xattr.c
include/linux/ftrace_event.h
include/linux/init_task.h
include/linux/kprobes.h
include/linux/nmi.h
include/linux/perf_event.h
include/linux/rculist.h
include/linux/rcupdate.h
include/linux/rcutiny.h
include/linux/rcutree.h
include/linux/sched.h
include/linux/stacktrace.h
include/linux/syscalls.h
include/linux/tracepoint.h
include/trace/define_trace.h
include/trace/events/power.h
include/trace/events/syscalls.h
include/trace/ftrace.h
init/Kconfig
init/main.c
kernel/futex.c
kernel/hw_breakpoint.c
kernel/kprobes.c
kernel/perf_event.c
kernel/power/suspend.c
kernel/rcutiny.c
kernel/rcutiny_plugin.h
kernel/rcutorture.c
kernel/rcutree.c
kernel/rcutree.h
kernel/rcutree_plugin.h
kernel/rcutree_trace.c
kernel/sched.c
kernel/srcu.c
kernel/sysctl.c
kernel/sysctl_binary.c
kernel/trace/Kconfig
kernel/trace/power-traces.c
kernel/trace/trace_event_perf.c
kernel/trace/trace_events.c
kernel/trace/trace_export.c
kernel/watchdog.c
lib/Kconfig.debug
scripts/Makefile.build
scripts/kernel-doc
tools/perf/Documentation/perf-annotate.txt
tools/perf/Documentation/perf-buildid-list.txt
tools/perf/Documentation/perf-diff.txt
tools/perf/Documentation/perf-kvm.txt
tools/perf/Documentation/perf-lock.txt
tools/perf/Documentation/perf-probe.txt
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/Documentation/perf-sched.txt
tools/perf/Documentation/perf-script-perl.txt [moved from tools/perf/Documentation/perf-trace-perl.txt with 90% similarity]
tools/perf/Documentation/perf-script-python.txt [moved from tools/perf/Documentation/perf-trace-python.txt with 89% similarity]
tools/perf/Documentation/perf-script.txt [moved from tools/perf/Documentation/perf-trace.txt with 62% similarity]
tools/perf/Documentation/perf-stat.txt
tools/perf/Documentation/perf-test.txt
tools/perf/Documentation/perf-timechart.txt
tools/perf/Documentation/perf-top.txt
tools/perf/MANIFEST
tools/perf/Makefile
tools/perf/bench/mem-memcpy-arch.h [new file with mode: 0644]
tools/perf/bench/mem-memcpy-x86-64-asm-def.h [new file with mode: 0644]
tools/perf/bench/mem-memcpy-x86-64-asm.S [new file with mode: 0644]
tools/perf/bench/mem-memcpy.c
tools/perf/builtin-annotate.c
tools/perf/builtin-buildid-list.c
tools/perf/builtin-diff.c
tools/perf/builtin-inject.c
tools/perf/builtin-kmem.c
tools/perf/builtin-lock.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-sched.c
tools/perf/builtin-script.c [moved from tools/perf/builtin-trace.c with 83% similarity]
tools/perf/builtin-stat.c
tools/perf/builtin-test.c
tools/perf/builtin-timechart.c
tools/perf/builtin-top.c
tools/perf/builtin.h
tools/perf/command-list.txt
tools/perf/feature-tests.mak
tools/perf/perf.c
tools/perf/scripts/perl/Perf-Trace-Util/Context.c
tools/perf/scripts/perl/Perf-Trace-Util/Context.xs
tools/perf/scripts/perl/Perf-Trace-Util/README
tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm
tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm
tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm
tools/perf/scripts/perl/bin/failed-syscalls-report
tools/perf/scripts/perl/bin/rw-by-file-report
tools/perf/scripts/perl/bin/rw-by-pid-report
tools/perf/scripts/perl/bin/rwtop-report
tools/perf/scripts/perl/bin/wakeup-latency-report
tools/perf/scripts/perl/bin/workqueue-stats-report
tools/perf/scripts/perl/check-perf-trace.pl
tools/perf/scripts/perl/rw-by-file.pl
tools/perf/scripts/perl/workqueue-stats.pl
tools/perf/scripts/python/Perf-Trace-Util/Context.c
tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py
tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/SchedGui.py
tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
tools/perf/scripts/python/bin/failed-syscalls-by-pid-report
tools/perf/scripts/python/bin/futex-contention-report
tools/perf/scripts/python/bin/netdev-times-report
tools/perf/scripts/python/bin/sched-migration-report
tools/perf/scripts/python/bin/sctop-report
tools/perf/scripts/python/bin/syscall-counts-by-pid-report
tools/perf/scripts/python/bin/syscall-counts-report
tools/perf/scripts/python/check-perf-trace.py
tools/perf/scripts/python/failed-syscalls-by-pid.py
tools/perf/scripts/python/sched-migration.py
tools/perf/scripts/python/sctop.py
tools/perf/scripts/python/syscall-counts-by-pid.py
tools/perf/scripts/python/syscall-counts.py
tools/perf/util/build-id.c
tools/perf/util/cpumap.c
tools/perf/util/cpumap.h
tools/perf/util/debug.c
tools/perf/util/debug.h
tools/perf/util/event.c
tools/perf/util/event.h
tools/perf/util/evsel.c [new file with mode: 0644]
tools/perf/util/evsel.h [new file with mode: 0644]
tools/perf/util/header.c
tools/perf/util/header.h
tools/perf/util/hist.c
tools/perf/util/hist.h
tools/perf/util/include/asm/cpufeature.h [new file with mode: 0644]
tools/perf/util/include/asm/dwarf2.h [new file with mode: 0644]
tools/perf/util/include/linux/bitops.h
tools/perf/util/include/linux/linkage.h [new file with mode: 0644]
tools/perf/util/parse-events.c
tools/perf/util/parse-events.h
tools/perf/util/parse-options.h
tools/perf/util/probe-event.c
tools/perf/util/probe-finder.c
tools/perf/util/probe-finder.h
tools/perf/util/scripting-engines/trace-event-perl.c
tools/perf/util/scripting-engines/trace-event-python.c
tools/perf/util/session.c
tools/perf/util/session.h
tools/perf/util/sort.c
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/thread.c
tools/perf/util/thread.h
tools/perf/util/trace-event-info.c
tools/perf/util/trace-event.h
tools/perf/util/ui/util.c
tools/perf/util/util.c
tools/perf/util/util.h
tools/perf/util/xyarray.c [new file with mode: 0644]
tools/perf/util/xyarray.h [new file with mode: 0644]

diff --git a/CREDITS b/CREDITS
index 41d8e63..494b6e4 100644 (file)
--- a/CREDITS
+++ b/CREDITS
@@ -2365,8 +2365,6 @@ E: acme@redhat.com
 W: http://oops.ghostprotocols.net:81/blog/
 P: 1024D/9224DF01 D5DF E3BB E3C8 BCBB F8AD  841A B6AB 4681 9224 DF01
 D: IPX, LLC, DCCP, cyc2x, wl3501_cs, net/ hacks
-S: R. Brasílio Itiberê, 4270/1010 - Água Verde
-S: 80240-060 - Curitiba - Paraná
 S: Brazil
 
 N: Karsten Merker
index a851118..6a8c73f 100644 (file)
@@ -1,18 +1,22 @@
 CONFIG_RCU_TRACE debugfs Files and Formats
 
 
-The rcutree implementation of RCU provides debugfs trace output that
-summarizes counters and state.  This information is useful for debugging
-RCU itself, and can sometimes also help to debug abuses of RCU.
-The following sections describe the debugfs files and formats.
+The rcutree and rcutiny implementations of RCU provide debugfs trace
+output that summarizes counters and state.  This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+The following sections describe the debugfs files and formats, first
+for rcutree and next for rcutiny.
 
 
-Hierarchical RCU debugfs Files and Formats
+CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
 
-This implementation of RCU provides three debugfs files under the
+These implementations of RCU provide five debugfs files under the
 top-level directory RCU: rcu/rcudata (which displays fields in struct
-rcu_data), rcu/rcugp (which displays grace-period counters), and
-rcu/rcuhier (which displays the struct rcu_node hierarchy).
+rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
+rcu/rcudata), rcu/rcugp (which displays grace-period counters),
+rcu/rcuhier (which displays the struct rcu_node hierarchy), and
+rcu/rcu_pending (which displays counts of the reasons that the
+rcu_pending() function decided that there was core RCU work to do).
 
 The output of "cat rcu/rcudata" looks as follows:
 
@@ -130,7 +134,8 @@ o   "ci" is the number of RCU callbacks that have been invoked for
        been registered in absence of CPU-hotplug activity.
 
 o      "co" is the number of RCU callbacks that have been orphaned due to
-       this CPU going offline.
+       this CPU going offline.  These orphaned callbacks have been moved
+       to an arbitrarily chosen online CPU.
 
 o      "ca" is the number of RCU callbacks that have been adopted due to
        other CPUs going offline.  Note that ci+co-ca+ql is the number of
@@ -168,12 +173,12 @@ o "gpnum" is the number of grace periods that have started.  It is
 
 The output of "cat rcu/rcuhier" looks as follows, with very long lines:
 
-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
 1/1 .>. 0:127 ^0    
 3/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3    
 3/3f .>. 0:5 ^0    2/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3    
 rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
 0/1 .>. 0:127 ^0    
 0/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3    
 0/3f .>. 0:5 ^0    0/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3
@@ -212,11 +217,6 @@ o  "fqlh" is the number of calls to force_quiescent_state() that
        exited immediately (without even being counted in nfqs above)
        due to contention on ->fqslock.
 
-o      "oqlen" is the number of callbacks on the "orphan" callback
-       list.  RCU callbacks are placed on this list by CPUs going
-       offline, and are "adopted" either by the CPU helping the outgoing
-       CPU or by the next rcu_barrier*() call, whichever comes first.
-
 o      Each element of the form "1/1 0:127 ^0" represents one struct
        rcu_node.  Each line represents one level of the hierarchy, from
        root to leaves.  It is best to think of the rcu_data structures
@@ -326,3 +326,115 @@ o "nn" is the number of times that this CPU needed nothing.  Alert
        readers will note that the rcu "nn" number for a given CPU very
        closely matches the rcu_bh "np" number for that same CPU.  This
        is due to short-circuit evaluation in rcu_pending().
+
+
+CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
+
+These implementations of RCU provide a single debugfs file under the
+top-level directory RCU, namely rcu/rcudata, which displays fields in
+rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
+rcu_preempt_ctrlblk.
+
+The output of "cat rcu/rcudata" is as follows:
+
+rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
+             ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
+             normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
+             exp balk: bt=0 nos=0
+rcu_sched: qlen: 0
+rcu_bh: qlen: 0
+
+This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
+rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
+The last three lines of the rcu_preempt section appear only in
+CONFIG_RCU_BOOST kernel builds.  The fields are as follows:
+
+o      "qlen" is the number of RCU callbacks currently waiting either
+       for an RCU grace period or waiting to be invoked.  This is the
+       only field present for rcu_sched and rcu_bh, due to the
+       short-circuiting of grace periods in those two cases.
+
+o      "gp" is the number of grace periods that have completed.
+
+o      "g197/p197/c197" displays the grace-period state, with the
+       "g" number being the number of grace periods that have started
+       (mod 256), the "p" number being the number of grace periods
+       that the CPU has responded to (also mod 256), and the "c"
+       number being the number of grace periods that have completed
+       (once again mod 256).
+
+       Why have both "gp" and "g"?  Because the data flowing into
+       "gp" is only present in a CONFIG_RCU_TRACE kernel.
+
+o      "tasks" is a set of bits.  The first bit is "T" if there are
+       currently tasks that have recently blocked within an RCU
+       read-side critical section, the second bit is "N" if any of the
+       aforementioned tasks are blocking the current RCU grace period,
+       and the third bit is "E" if any of the aforementioned tasks are
+       blocking the current expedited grace period.  Each bit is "."
+       if the corresponding condition does not hold.
+
+o      "ttb" is a single bit.  It is "B" if any of the blocked tasks
+       need to be priority boosted and "." otherwise.
+
+o      "btg" indicates whether boosting has been carried out during
+       the current grace period, with "exp" indicating that boosting
+       is in progress for an expedited grace period, "no" indicating
+       that boosting has not yet started for a normal grace period,
+       "begun" indicating that boosting has begun for a normal grace
+       period, and "done" indicating that boosting has completed for
+       a normal grace period.
+
+o      "ntb" is the total number of tasks subjected to RCU priority boosting
+       periods since boot.
+
+o      "neb" is the number of expedited grace periods that have had
+       to resort to RCU priority boosting since boot.
+
+o      "nnb" is the number of normal grace periods that have had
+       to resort to RCU priority boosting since boot.
+
+o      "j" is the low-order 12 bits of the jiffies counter in hexadecimal.
+
+o      "bt" is the low-order 12 bits of the value that the jiffies counter
+       will have at the next time that boosting is scheduled to begin.
+
+o      In the line beginning with "normal balk", the fields are as follows:
+
+       o       "nt" is the number of times that the system balked from
+               boosting because there were no blocked tasks to boost.
+               Note that the system will balk from boosting even if the
+               grace period is overdue when the currently running task
+               is looping within an RCU read-side critical section.
+               There is no point in boosting in this case, because
+               boosting a running task won't make it run any faster.
+
+       o       "gt" is the number of times that the system balked
+               from boosting because, although there were blocked tasks,
+               none of them were preventing the current grace period
+               from completing.
+
+       o       "bt" is the number of times that the system balked
+               from boosting because boosting was already in progress.
+
+       o       "b" is the number of times that the system balked from
+               boosting because boosting had already completed for
+               the grace period in question.
+
+       o       "ny" is the number of times that the system balked from
+               boosting because it was not yet time to start boosting
+               the grace period in question.
+
+       o       "nos" is the number of times that the system balked from
+               boosting for inexplicable ("not otherwise specified")
+               reasons.  This can actually happen due to races involving
+               increments of the jiffies counter.
+
+o      In the line beginning with "exp balk", the fields are as follows:
+
+       o       "bt" is the number of times that the system balked from
+               boosting because there were no blocked tasks to boost.
+
+       o       "nos" is the number of times that the system balked from
+                boosting for inexplicable ("not otherwise specified")
+                reasons.
index d9bcffd..470d3db 100644 (file)
@@ -62,6 +62,10 @@ aic7*reg_print.c*
 aic7*seq.h*
 aicasm
 aicdb.h*
+altivec1.c
+altivec2.c
+altivec4.c
+altivec8.c
 asm-offsets.h
 asm_offsets.h
 autoconf.h*
@@ -76,6 +80,7 @@ btfixupprep
 build
 bvmlinux
 bzImage*
+capflags.c
 classlist.h*
 comp*.log
 compile.h*
@@ -94,6 +99,7 @@ devlist.h*
 docproc
 elf2ecoff
 elfconfig.h*
+evergreen_reg_safe.h
 fixdep
 flask.h
 fore200e_mkfirm
@@ -108,9 +114,16 @@ genksyms
 *_gray256.c
 ihex2fw
 ikconfig.h*
+inat-tables.c
 initramfs_data.cpio
 initramfs_data.cpio.gz
 initramfs_list
+int16.c
+int1.c
+int2.c
+int32.c
+int4.c
+int8.c
 kallsyms
 kconfig
 keywords.c
@@ -140,6 +153,7 @@ mkprep
 mktables
 mktree
 modpost
+modules.builtin
 modules.order
 modversions.h*
 ncscope.*
@@ -153,14 +167,23 @@ pca200e.bin
 pca200e_ecd.bin2
 piggy.gz
 piggyback
+piggy.S
 pnmtologo
 ppc_defs.h*
 pss_boot.h
 qconf
+r100_reg_safe.h
+r200_reg_safe.h
+r300_reg_safe.h
+r420_reg_safe.h
+r600_reg_safe.h
 raid6altivec*.c
 raid6int*.c
 raid6tables.c
 relocs
+rn50_reg_safe.h
+rs600_reg_safe.h
+rv515_reg_safe.h
 series
 setup
 setup.bin
@@ -169,6 +192,7 @@ sImage
 sm_tbl*
 split-include
 syscalltab.h
+tables.c
 tags
 tftpboot.img
 timeconst.h
@@ -190,6 +214,7 @@ vmlinux
 vmlinux-*
 vmlinux.aout
 vmlinux.lds
+voffset.h
 vsyscall.lds
 vsyscall_32.lds
 wanxlfw.inc
@@ -200,3 +225,4 @@ wakeup.elf
 wakeup.lds
 zImage*
 zconf.hash.c
+zoffset.h
index 715eaaf..9a86746 100644 (file)
        Notes: Further information in
        http://www.oreilly.com/catalog/linuxdrive2/
 
-     * Title: "Linux Device Drivers, 3nd Edition"
+     * Title: "Linux Device Drivers, 3rd Edition"
        Authors: Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman
        Publisher: O'Reilly & Associates.
        Date: 2005.
        Pages: 600.
        ISBN: 0-13-101908-2
 
-     * Title:  "The  Design  and Implementation of the 4.4 BSD UNIX
-       Operating System"
-       Author: Marshall Kirk McKusick, Keith Bostic, Michael J. Karels,
-       John S. Quarterman.
-       Publisher: Addison-Wesley.
-       Date: 1996.
-       ISBN: 0-201-54979-4
-
      * Title: "Programming for the real world - POSIX.4"
        Author: Bill O. Gallmeister.
        Publisher: O'Reilly & Associates, Inc..
        POSIX. Good reference.
 
      * Title:  "UNIX  Systems  for  Modern Architectures: Symmetric
-       Multiprocesssing and Caching for Kernel Programmers"
+       Multiprocessing and Caching for Kernel Programmers"
        Author: Curt Schimmel.
        Publisher: Addison Wesley.
        Date: June, 1994.
        Pages: 432.
        ISBN: 0-201-63338-8
 
-     * Title:  "The  Design  and Implementation of the 4.3 BSD UNIX
-       Operating System"
-       Author: Samuel J. Leffler, Marshall Kirk McKusick, Michael J.
-       Karels, John S. Quarterman.
-       Publisher: Addison-Wesley.
-       Date: 1989 (reprinted with corrections on October, 1990).
-       ISBN: 0-201-06196-1
-
-     * Title: "The Design of the UNIX Operating System"
-       Author: Maurice J. Bach.
-       Publisher: Prentice Hall.
-       Date: 1986.
-       Pages: 471.
-       ISBN: 0-13-201757-1
-
      MISCELLANEOUS:
 
      * Name: linux/Documentation
index 1031923..d6496fd 100644 (file)
@@ -1579,20 +1579,12 @@ and is between 256 and 4096 characters. It is defined in the file
 
        nmi_watchdog=   [KNL,BUGS=X86] Debugging features for SMP kernels
                        Format: [panic,][num]
-                       Valid num: 0,1,2
+                       Valid num: 0
                        0 - turn nmi_watchdog off
-                       1 - use the IO-APIC timer for the NMI watchdog
-                       2 - use the local APIC for the NMI watchdog using
-                       a performance counter. Note: This will use one
-                       performance counter and the local APIC's performance
-                       vector.
                        When panic is specified, panic when an NMI watchdog
                        timeout occurs.
                        This is useful when you use a panic=... timeout and
                        need the box quickly up again.
-                       Instead of 1 and 2 it is possible to use the following
-                       symbolic names: lapic and ioapic
-                       Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic
 
        netpoll.carrier_timeout=
                        [NET] Specifies amount of time (in seconds) that
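(Illustrative usage, inferred from the format described above rather than taken
from the patch: booting with "nmi_watchdog=0" turns the watchdog off, while
"nmi_watchdog=panic" makes the kernel panic when a watchdog timeout occurs.)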
diff --git a/Documentation/trace/events-power.txt b/Documentation/trace/events-power.txt
new file mode 100644 (file)
index 0000000..96d87b6
--- /dev/null
@@ -0,0 +1,90 @@
+
+                       Subsystem Trace Points: power
+
+The power tracing system captures events related to power transitions
+within the kernel. Broadly speaking there are three major subheadings:
+
+  o Power state switch which reports events related to suspend (S-states),
+     cpuidle (C-states) and cpufreq (P-states)
+  o System clock related changes
+  o Power domains related changes and transitions
+
+This document describes what each of the tracepoints is and why they
+might be useful.
+
+Cf. include/trace/events/power.h for the events definitions.
+
+1. Power state switch events
+============================
+
+1.1 New trace API
+-----------------
+
+A 'cpu' event class gathers the CPU-related events: cpuidle and
+cpufreq.
+
+cpu_idle               "state=%lu cpu_id=%lu"
+cpu_frequency          "state=%lu cpu_id=%lu"
+
+A suspend event is used to indicate the system going in and out of the
+suspend mode:
+
+machine_suspend                "state=%lu"
+
+
+Note: the value of '-1' or '4294967295' for state means an exit from the current state,
+i.e. trace_cpu_idle(4, smp_processor_id()) means that the system
+enters the idle state 4, while trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id())
+means that the system exits the previous idle state.
+
+The event that has 'state=4294967295' in the trace is very important to the
+user-space tools that use it to detect the end of the current state, and so
+to correctly draw the state diagrams and to calculate accurate statistics.
+
+1.2 DEPRECATED trace API
+------------------------
+
+A new Kconfig option CONFIG_EVENT_POWER_TRACING_DEPRECATED with the default value of
+'y' has been created. This allows the legacy trace power API to be used conjointly
+with the new trace API.
+The Kconfig option, the old trace API (in include/trace/events/power.h) and the
+old trace points will disappear in a future release (namely 2.6.41).
+
+power_start            "type=%lu state=%lu cpu_id=%lu"
+power_frequency                "type=%lu state=%lu cpu_id=%lu"
+power_end              "cpu_id=%lu"
+
+The 'type' parameter takes one of these macros:
+ . POWER_NONE  = 0,
+ . POWER_CSTATE        = 1,    /* C-State */
+ . POWER_PSTATE        = 2,    /* Frequency change or DVFS */
+
+The 'state' parameter is set depending on the type:
+ . Target C-state for type=POWER_CSTATE,
+ . Target frequency for type=POWER_PSTATE,
+
+power_end is used to indicate the exit of a state, corresponding to the latest
+power_start event.
+
+2. Clocks events
+================
+The clock events are used for clock enable/disable and for
+clock rate change.
+
+clock_enable           "%s state=%lu cpu_id=%lu"
+clock_disable          "%s state=%lu cpu_id=%lu"
+clock_set_rate         "%s state=%lu cpu_id=%lu"
+
+The first parameter gives the clock name (e.g. "gpio1_iclk").
+The second parameter is '1' for enable, '0' for disable, or the target
+clock rate for set_rate.
+
+3. Power domains events
+=======================
+The power domain events are used for power domain transitions.
+
+power_domain_target    "%s state=%lu cpu_id=%lu"
+
+The first parameter gives the power domain name (e.g. "mpu_pwrdm").
+The second parameter is the power domain target state.
+
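A minimal sketch of the cpu_idle enter/exit pairing described in section 1.1
above.  The trace_cpu_idle() calls and the PWR_EVENT_EXIT marker follow the
document's own example; the surrounding function and its 'target_state'
argument are hypothetical.

	#include <linux/smp.h>
	#include <trace/events/power.h>

	static void example_enter_idle(unsigned int target_state)
	{
		/* mark entry into the requested C-state on this CPU */
		trace_cpu_idle(target_state, smp_processor_id());

		/* ... architecture-specific idle entry would go here ... */

		/* '-1' / '4294967295' (PWR_EVENT_EXIT) marks leaving the
		 * state, so user-space tools can close the interval */
		trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
	}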
index 7585e9d..b1dda78 100644 (file)
@@ -4627,7 +4627,7 @@ PERFORMANCE EVENTS SUBSYSTEM
 M:     Peter Zijlstra <a.p.zijlstra@chello.nl>
 M:     Paul Mackerras <paulus@samba.org>
 M:     Ingo Molnar <mingo@elte.hu>
-M:     Arnaldo Carvalho de Melo <acme@redhat.com>
+M:     Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
 S:     Supported
 F:     kernel/perf_event*.c
 F:     include/linux/perf_event.h
index fe792ca..5996e7a 100644 (file)
@@ -1,10 +1,4 @@
 #ifndef __ASM_ALPHA_PERF_EVENT_H
 #define __ASM_ALPHA_PERF_EVENT_H
 
-#ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
-#else
-static inline void init_hw_perf_events(void)    { }
-#endif
-
 #endif /* __ASM_ALPHA_PERF_EVENT_H */
index 5f77afb..4c8bb37 100644 (file)
@@ -112,8 +112,6 @@ init_IRQ(void)
        wrent(entInt, 0);
 
        alpha_mv.init_irq();
-
-       init_hw_perf_events();
 }
 
 /*
index 1cc4968..90561c4 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/kdebug.h>
 #include <linux/mutex.h>
+#include <linux/init.h>
 
 #include <asm/hwrpb.h>
 #include <asm/atomic.h>
@@ -863,13 +864,13 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
 /*
  * Init call to initialise performance events at kernel startup.
  */
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
 {
        pr_info("Performance events: ");
 
        if (!supported_cpu()) {
                pr_cont("No support for your CPU.\n");
-               return;
+               return 0;
        }
 
        pr_cont("Supported CPU type!\n");
@@ -881,6 +882,8 @@ void __init init_hw_perf_events(void)
        /* And set up PMU specification */
        alpha_pmu = &ev67_pmu;
 
-       perf_pmu_register(&pmu);
-}
+       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
 
+       return 0;
+}
+early_initcall(init_hw_perf_events);
index 07a5035..fdfa497 100644 (file)
@@ -3034,11 +3034,11 @@ init_hw_perf_events(void)
                pr_info("no hardware support available\n");
        }
 
-       perf_pmu_register(&pmu);
+       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
 
        return 0;
 }
-arch_initcall(init_hw_perf_events);
+early_initcall(init_hw_perf_events);
 
 /*
  * Callchain handling code.
index 5c7c6fc..183e0d2 100644 (file)
@@ -1047,6 +1047,6 @@ init_hw_perf_events(void)
 
        return 0;
 }
-arch_initcall(init_hw_perf_events);
+early_initcall(init_hw_perf_events);
 
 #endif /* defined(CONFIG_CPU_MIPS32)... */
index 7c07de0..b150b51 100644 (file)
@@ -126,4 +126,4 @@ static int init_e500_pmu(void)
        return register_fsl_emb_pmu(&e500_pmu);
 }
 
-arch_initcall(init_e500_pmu);
+early_initcall(init_e500_pmu);
index 09d7202..2cc5e03 100644 (file)
@@ -414,4 +414,4 @@ static int init_mpc7450_pmu(void)
        return register_power_pmu(&mpc7450_pmu);
 }
 
-arch_initcall(init_mpc7450_pmu);
+early_initcall(init_mpc7450_pmu);
index 3129c85..5674807 100644 (file)
@@ -1379,7 +1379,7 @@ int register_power_pmu(struct power_pmu *pmu)
                freeze_events_kernel = MMCR0_FCHV;
 #endif /* CONFIG_PPC64 */
 
-       perf_pmu_register(&power_pmu);
+       perf_pmu_register(&power_pmu, "cpu", PERF_TYPE_RAW);
        perf_cpu_notifier(power_pmu_notifier);
 
        return 0;
index 7ecca59..4dcf5f8 100644 (file)
@@ -681,7 +681,7 @@ int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu)
        pr_info("%s performance monitor hardware support registered\n",
                pmu->name);
 
-       perf_pmu_register(&fsl_emb_pmu);
+       perf_pmu_register(&fsl_emb_pmu, "cpu", PERF_TYPE_RAW);
 
        return 0;
 }
index 2a361cd..ead8b3c 100644 (file)
@@ -613,4 +613,4 @@ static int init_power4_pmu(void)
        return register_power_pmu(&power4_pmu);
 }
 
-arch_initcall(init_power4_pmu);
+early_initcall(init_power4_pmu);
index 199de52..eca0ac5 100644 (file)
@@ -682,4 +682,4 @@ static int init_power5p_pmu(void)
        return register_power_pmu(&power5p_pmu);
 }
 
-arch_initcall(init_power5p_pmu);
+early_initcall(init_power5p_pmu);
index 98b6a72..d5ff0f6 100644 (file)
@@ -621,4 +621,4 @@ static int init_power5_pmu(void)
        return register_power_pmu(&power5_pmu);
 }
 
-arch_initcall(init_power5_pmu);
+early_initcall(init_power5_pmu);
index 84a607b..3160392 100644 (file)
@@ -544,4 +544,4 @@ static int init_power6_pmu(void)
        return register_power_pmu(&power6_pmu);
 }
 
-arch_initcall(init_power6_pmu);
+early_initcall(init_power6_pmu);
index 852f7b7..593740f 100644 (file)
@@ -369,4 +369,4 @@ static int init_power7_pmu(void)
        return register_power_pmu(&power7_pmu);
 }
 
-arch_initcall(init_power7_pmu);
+early_initcall(init_power7_pmu);
index 3fee685..9a6e093 100644 (file)
@@ -494,4 +494,4 @@ static int init_ppc970_pmu(void)
        return register_power_pmu(&ppc970_pmu);
 }
 
-arch_initcall(init_ppc970_pmu);
+early_initcall(init_ppc970_pmu);
index dbf3b4b..748955d 100644 (file)
@@ -250,4 +250,4 @@ static int __init sh7750_pmu_init(void)
 
        return register_sh_pmu(&sh7750_pmu);
 }
-arch_initcall(sh7750_pmu_init);
+early_initcall(sh7750_pmu_init);
index 5802765..17e6beb 100644 (file)
@@ -284,4 +284,4 @@ static int __init sh4a_pmu_init(void)
 
        return register_sh_pmu(&sh4a_pmu);
 }
-arch_initcall(sh4a_pmu_init);
+early_initcall(sh4a_pmu_init);
index 5a4b334..2ee21a4 100644 (file)
@@ -389,7 +389,7 @@ int __cpuinit register_sh_pmu(struct sh_pmu *_pmu)
 
        WARN_ON(_pmu->num_events > MAX_HWEVENTS);
 
-       perf_pmu_register(&pmu);
+       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
        perf_cpu_notifier(sh_pmu_notifier);
        return 0;
 }
index 6e8bfa1..4d3dbe3 100644 (file)
@@ -4,8 +4,6 @@
 #ifdef CONFIG_PERF_EVENTS
 #include <asm/ptrace.h>
 
-extern void init_hw_perf_events(void);
-
 #define perf_arch_fetch_caller_regs(regs, ip)          \
 do {                                                   \
        unsigned long _pstate, _asi, _pil, _i7, _fp;    \
@@ -26,8 +24,6 @@ do {                                                  \
        (regs)->u_regs[UREG_I6] = _fp;                  \
        (regs)->u_regs[UREG_I7] = _i7;                  \
 } while (0)
-#else
-static inline void init_hw_perf_events(void)   { }
 #endif
 
 #endif
index a4bd7ba..300f810 100644 (file)
@@ -270,8 +270,6 @@ int __init nmi_init(void)
                        atomic_set(&nmi_active, -1);
                }
        }
-       if (!err)
-               init_hw_perf_events();
 
        return err;
 }
index 0d6deb5..7605786 100644 (file)
@@ -1307,20 +1307,23 @@ static bool __init supported_pmu(void)
        return false;
 }
 
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
 {
        pr_info("Performance events: ");
 
        if (!supported_pmu()) {
                pr_cont("No support for PMU type '%s'\n", sparc_pmu_type);
-               return;
+               return 0;
        }
 
        pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
 
-       perf_pmu_register(&pmu);
+       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
        register_die_notifier(&perf_event_nmi_notifier);
+
+       return 0;
 }
+early_initcall(init_hw_perf_events);
 
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
                           struct pt_regs *regs)
index 76561d2..4a2adaa 100644 (file)
@@ -180,8 +180,15 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
  * On the local CPU you need to be protected against NMI or MCE handlers seeing an
  * inconsistent instruction while you patch.
  */
+struct text_poke_param {
+       void *addr;
+       const void *opcode;
+       size_t len;
+};
+
 extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
+extern void text_poke_smp_batch(struct text_poke_param *params, int n);
 
 #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
 #define IDEAL_NOP_SIZE_5 5
index 13b0eba..ba870bb 100644 (file)
@@ -15,10 +15,6 @@ static inline int irq_canonicalize(int irq)
        return ((irq == 2) ? 9 : irq);
 }
 
-#ifdef CONFIG_X86_LOCAL_APIC
-# define ARCH_HAS_NMI_WATCHDOG
-#endif
-
 #ifdef CONFIG_X86_32
 extern void irq_ctx_init(int cpu);
 #else
index 5bdfca8..f23eb25 100644 (file)
@@ -28,7 +28,7 @@ extern void die(const char *, struct pt_regs *,long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
 extern void show_registers(struct pt_regs *regs);
 extern void show_trace(struct task_struct *t, struct pt_regs *regs,
-                      unsigned long *sp, unsigned long bp);
+                      unsigned long *sp);
 extern void __show_regs(struct pt_regs *regs, int all);
 extern void show_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
index 6b89f5e..86030f6 100644 (file)
 #define MSR_AMD64_IBSCTL               0xc001103a
 #define MSR_AMD64_IBSBRTARGET          0xc001103b
 
+/* Fam 15h MSRs */
+#define MSR_F15H_PERF_CTL              0xc0010200
+#define MSR_F15H_PERF_CTR              0xc0010201
+
 /* Fam 10h MSRs */
 #define MSR_FAM10H_MMIO_CONF_BASE      0xc0010058
 #define FAM10H_MMIO_CONF_ENABLE                (1<<0)
index 932f0f8..c4021b9 100644 (file)
@@ -5,41 +5,15 @@
 #include <asm/irq.h>
 #include <asm/io.h>
 
-#ifdef ARCH_HAS_NMI_WATCHDOG
-
-/**
- * do_nmi_callback
- *
- * Check to see if a callback exists and execute it.  Return 1
- * if the handler exists and was handled successfully.
- */
-int do_nmi_callback(struct pt_regs *regs, int cpu);
+#ifdef CONFIG_X86_LOCAL_APIC
 
 extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
-extern int check_nmi_watchdog(void);
-#if !defined(CONFIG_LOCKUP_DETECTOR)
-extern int nmi_watchdog_enabled;
-#endif
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
 extern int reserve_perfctr_nmi(unsigned int);
 extern void release_perfctr_nmi(unsigned int);
 extern int reserve_evntsel_nmi(unsigned int);
 extern void release_evntsel_nmi(unsigned int);
 
-extern void setup_apic_nmi_watchdog(void *);
-extern void stop_apic_nmi_watchdog(void *);
-extern void disable_timer_nmi_watchdog(void);
-extern void enable_timer_nmi_watchdog(void);
-extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
-extern void cpu_nmi_set_wd_enabled(void);
-
-extern atomic_t nmi_active;
-extern unsigned int nmi_watchdog;
-#define NMI_NONE       0
-#define NMI_IO_APIC    1
-#define NMI_LOCAL_APIC 2
-#define NMI_INVALID    3
-
 struct ctl_table;
 extern int proc_nmi_enabled(struct ctl_table *, int ,
                        void __user *, size_t *, loff_t *);
@@ -47,33 +21,8 @@ extern int unknown_nmi_panic;
 
 void arch_trigger_all_cpu_backtrace(void);
 #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
-
-static inline void localise_nmi_watchdog(void)
-{
-       if (nmi_watchdog == NMI_IO_APIC)
-               nmi_watchdog = NMI_LOCAL_APIC;
-}
-
-/* check if nmi_watchdog is active (ie was specified at boot) */
-static inline int nmi_watchdog_active(void)
-{
-       /*
-        * actually it should be:
-        *      return (nmi_watchdog == NMI_LOCAL_APIC ||
-        *              nmi_watchdog == NMI_IO_APIC)
-        * but since they are power of two we could use a
-        * cheaper way --cvg
-        */
-       return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
-}
 #endif
 
-void lapic_watchdog_stop(void);
-int lapic_watchdog_init(unsigned nmi_hz);
-int lapic_wd_event(unsigned nmi_hz);
-unsigned lapic_adjust_nmi_hz(unsigned hz);
-void disable_lapic_nmi_watchdog(void);
-void enable_lapic_nmi_watchdog(void);
 void stop_nmi(void);
 void restart_nmi(void);
 
index 550e26b..d9d4dae 100644 (file)
@@ -125,7 +125,6 @@ union cpuid10_edx {
 #define IBS_OP_MAX_CNT_EXT     0x007FFFFFULL   /* not a register bit mask */
 
 #ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
 extern void perf_events_lapic_init(void);
 
 #define PERF_EVENT_INDEX_OFFSET                        0
@@ -156,7 +155,6 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
 }
 
 #else
-static inline void init_hw_perf_events(void)           { }
 static inline void perf_events_lapic_init(void)        { }
 #endif
 
index a70cd21..295e2ff 100644 (file)
@@ -744,14 +744,6 @@ enum P4_ESCR_EMASKS {
 };
 
 /*
- * P4 PEBS specifics (Replay Event only)
- *
- * Format (bits):
- *   0-6: metric from P4_PEBS_METRIC enum
- *    7 : reserved
- *    8 : reserved
- * 9-11 : reserved
- *
  * Note we have UOP and PEBS bits reserved for now
  * just in case if we will need them once
  */
@@ -788,5 +780,60 @@ enum P4_PEBS_METRIC {
        P4_PEBS_METRIC__max
 };
 
+/*
+ * Notes on internal configuration of ESCR+CCCR tuples
+ *
+ * Since P4 has quite a different architecture of
+ * performance registers compared with the "architectural"
+ * ones, and we have only 64 bits to keep the configuration
+ * of a performance event, the following trick is used.
+ *
+ * 1) Since both ESCR and CCCR registers have only low
+ *    32 bits valuable, we pack them into a single 64 bit
+ *    configuration. Low 32 bits of such config correspond
+ *    to low 32 bits of CCCR register and high 32 bits
+ *    correspond to low 32 bits of ESCR register.
+ *
+ * 2) The meaning of every bit of such config field can
+ *    be found in Intel SDM but it should be noted that
+ *    we "borrow" some reserved bits for our own usage and
+ *    clean them or set to a proper value when we do
+ *    a real write to hardware registers.
+ *
+ * 3) The format of bits of config is the following
+ *    and should be either 0 or set to some predefined
+ *    values:
+ *
+ *    Low 32 bits
+ *    -----------
+ *      0-6: P4_PEBS_METRIC enum
+ *     7-11:                    reserved
+ *       12:                    reserved (Enable)
+ *    13-15:                    reserved (ESCR select)
+ *    16-17: Active Thread
+ *       18: Compare
+ *       19: Complement
+ *    20-23: Threshold
+ *       24: Edge
+ *       25:                    reserved (FORCE_OVF)
+ *       26:                    reserved (OVF_PMI_T0)
+ *       27:                    reserved (OVF_PMI_T1)
+ *    28-29:                    reserved
+ *       30:                    reserved (Cascade)
+ *       31:                    reserved (OVF)
+ *
+ *    High 32 bits
+ *    ------------
+ *        0:                    reserved (T1_USR)
+ *        1:                    reserved (T1_OS)
+ *        2:                    reserved (T0_USR)
+ *        3:                    reserved (T0_OS)
+ *        4: Tag Enable
+ *      5-8: Tag Value
+ *     9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper)
+ *    25-30: enum P4_EVENTS
+ *       31:                    reserved (HT thread)
+ */
+
 #endif /* PERF_EVENT_P4_H */
 
index 1def601..6c22bf3 100644 (file)
@@ -48,7 +48,6 @@ static inline void __init smpboot_setup_io_apic(void)
                setup_IO_APIC();
        else {
                nr_ioapics = 0;
-               localise_nmi_watchdog();
        }
 #endif
 }
index 2b16a2a..52b5c7e 100644 (file)
@@ -7,6 +7,7 @@
 #define _ASM_X86_STACKTRACE_H
 
 #include <linux/uaccess.h>
+#include <linux/ptrace.h>
 
 extern int kstack_depth_to_print;
 
@@ -46,7 +47,7 @@ struct stacktrace_ops {
 };
 
 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp,
+               unsigned long *stack,
                const struct stacktrace_ops *ops, void *data);
 
 #ifdef CONFIG_X86_32
@@ -57,13 +58,39 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
 #endif
 
+#ifdef CONFIG_FRAME_POINTER
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+       unsigned long bp;
+
+       if (regs)
+               return regs->bp;
+
+       if (task == current) {
+               /* Grab bp right from our regs */
+               get_bp(bp);
+               return bp;
+       }
+
+       /* bp is the last reg pushed by switch_to */
+       return *(unsigned long *)task->thread.sp;
+}
+#else
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+       return 0;
+}
+#endif
+
 extern void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp, char *log_lvl);
+                  unsigned long *stack, char *log_lvl);
 
 extern void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *sp, unsigned long bp, char *log_lvl);
+                  unsigned long *sp, char *log_lvl);
 
 extern unsigned int code_bytes;
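A minimal sketch of how callers change with this interface: the 'bp' argument
disappears from dump_trace()/show_trace_log_lvl(), and a frame pointer, where
one is still wanted, can be obtained from stack_frame().  The variables below
(task, regs, stack, ops, data) are hypothetical placeholders.

	/* before: dump_trace(task, regs, stack, bp, &ops, data); */
	dump_trace(task, regs, stack, &ops, data);

	/* frame pointer of the current task, with no saved registers at hand */
	unsigned long bp = stack_frame(current, NULL);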
 
index 5469630..fa7b917 100644 (file)
 unsigned long long native_sched_clock(void);
 extern int recalibrate_cpu_khz(void);
 
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-extern int timer_ack;
-#else
-# define timer_ack (0)
-#endif
-
 extern int no_timer_check;
 
 /* Accelerators for sched_clock()
index 5079f24..553d0b0 100644 (file)
@@ -591,17 +591,21 @@ static atomic_t stop_machine_first;
 static int wrote_text;
 
 struct text_poke_params {
-       void *addr;
-       const void *opcode;
-       size_t len;
+       struct text_poke_param *params;
+       int nparams;
 };
 
 static int __kprobes stop_machine_text_poke(void *data)
 {
        struct text_poke_params *tpp = data;
+       struct text_poke_param *p;
+       int i;
 
        if (atomic_dec_and_test(&stop_machine_first)) {
-               text_poke(tpp->addr, tpp->opcode, tpp->len);
+               for (i = 0; i < tpp->nparams; i++) {
+                       p = &tpp->params[i];
+                       text_poke(p->addr, p->opcode, p->len);
+               }
                smp_wmb();      /* Make sure other cpus see that this has run */
                wrote_text = 1;
        } else {
@@ -610,8 +614,12 @@ static int __kprobes stop_machine_text_poke(void *data)
                smp_mb();       /* Load wrote_text before following execution */
        }
 
-       flush_icache_range((unsigned long)tpp->addr,
-                          (unsigned long)tpp->addr + tpp->len);
+       for (i = 0; i < tpp->nparams; i++) {
+               p = &tpp->params[i];
+               flush_icache_range((unsigned long)p->addr,
+                                  (unsigned long)p->addr + p->len);
+       }
+
        return 0;
 }
 
@@ -631,10 +639,13 @@ static int __kprobes stop_machine_text_poke(void *data)
 void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
 {
        struct text_poke_params tpp;
+       struct text_poke_param p;
 
-       tpp.addr = addr;
-       tpp.opcode = opcode;
-       tpp.len = len;
+       p.addr = addr;
+       p.opcode = opcode;
+       p.len = len;
+       tpp.params = &p;
+       tpp.nparams = 1;
        atomic_set(&stop_machine_first, 1);
        wrote_text = 0;
        /* Use __stop_machine() because the caller already got online_cpus. */
@@ -642,6 +653,26 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
        return addr;
 }
 
+/**
+ * text_poke_smp_batch - Update instructions on a live kernel on SMP
+ * @params: an array of text_poke parameters
+ * @n: the number of elements in params.
+ *
+ * Modify multi-byte instructions by using stop_machine() on SMP. Since
+ * stop_machine() is a heavy task, it is better to aggregate text_poke requests
+ * and do them all at once if possible.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
+{
+       struct text_poke_params tpp = {.params = params, .nparams = n};
+
+       atomic_set(&stop_machine_first, 1);
+       wrote_text = 0;
+       stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+}
+
 #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
 
 #ifdef CONFIG_X86_64
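A minimal sketch of the batched usage suggested by the comment above: fill an
array of struct text_poke_param and patch all sites with a single
stop_machine() invocation.  The addresses, opcodes and lengths are
placeholders, and the caller is assumed to hold text_mutex and to have called
get_online_cpus().

	struct text_poke_param params[] = {
		{ .addr = site1, .opcode = insn1, .len = len1 },
		{ .addr = site2, .opcode = insn2, .len = len2 },
	};

	text_poke_smp_batch(params, ARRAY_SIZE(params));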
index 910f20b..3966b56 100644 (file)
@@ -3,10 +3,7 @@
 #
 
 obj-$(CONFIG_X86_LOCAL_APIC)   += apic.o apic_noop.o probe_$(BITS).o ipi.o
-ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
-obj-$(CONFIG_X86_LOCAL_APIC)   += nmi.o
-endif
-obj-$(CONFIG_HARDLOCKUP_DETECTOR)      += hw_nmi.o
+obj-y                          += hw_nmi.o
 
 obj-$(CONFIG_X86_IO_APIC)      += io_apic.o
 obj-$(CONFIG_SMP)              += ipi.o
index 7821813..fb76578 100644 (file)
@@ -31,7 +31,6 @@
 #include <linux/init.h>
 #include <linux/cpu.h>
 #include <linux/dmi.h>
-#include <linux/nmi.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
 
@@ -799,11 +798,7 @@ void __init setup_boot_APIC_clock(void)
         * PIT/HPET going.  Otherwise register lapic as a dummy
         * device.
         */
-       if (nmi_watchdog != NMI_IO_APIC)
-               lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-       else
-               pr_warning("APIC timer registered as dummy,"
-                       " due to nmi_watchdog=%d!\n", nmi_watchdog);
+       lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
 
        /* Setup the lapic or request the broadcast */
        setup_APIC_timer();
@@ -1387,7 +1382,6 @@ void __cpuinit end_local_APIC_setup(void)
        }
 #endif
 
-       setup_apic_nmi_watchdog(NULL);
        apic_pm_activate();
 
        /*
@@ -1758,17 +1752,10 @@ int __init APIC_init_uniprocessor(void)
                setup_IO_APIC();
        else {
                nr_ioapics = 0;
-               localise_nmi_watchdog();
        }
-#else
-       localise_nmi_watchdog();
 #endif
 
        x86_init.timers.setup_percpu_clockev();
-#ifdef CONFIG_X86_64
-       check_nmi_watchdog();
-#endif
-
        return 0;
 }
 
index 62f6e1e..72ec29e 100644 (file)
 #include <linux/nmi.h>
 #include <linux/module.h>
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
 u64 hw_nmi_get_sample_period(void)
 {
        return (u64)(cpu_khz) * 1000 * 60;
 }
+#endif
 
-#ifdef ARCH_HAS_NMI_WATCHDOG
-
+#ifdef arch_trigger_all_cpu_backtrace
 /* For reliability, we're prepared to waste bits here. */
 static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
 
+/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+static unsigned long backtrace_flag;
+
 void arch_trigger_all_cpu_backtrace(void)
 {
        int i;
 
+       if (test_and_set_bit(0, &backtrace_flag))
+               /*
+                * If there is already a trigger_all_cpu_backtrace() in progress
+                * (backtrace_flag == 1), don't output double cpu dump infos.
+                */
+               return;
+
        cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
 
        printk(KERN_INFO "sending NMI to all CPUs:\n");
@@ -42,6 +53,9 @@ void arch_trigger_all_cpu_backtrace(void)
                        break;
                mdelay(1);
        }
+
+       clear_bit(0, &backtrace_flag);
+       smp_mb__after_clear_bit();
 }
 
 static int __kprobes
@@ -50,7 +64,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
 {
        struct die_args *args = __args;
        struct pt_regs *regs;
-       int cpu = smp_processor_id();
+       int cpu;
 
        switch (cmd) {
        case DIE_NMI:
@@ -62,6 +76,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
        }
 
        regs = args->regs;
+       cpu = smp_processor_id();
 
        if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
                static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -91,18 +106,3 @@ static int __init register_trigger_all_cpu_backtrace(void)
 }
 early_initcall(register_trigger_all_cpu_backtrace);
 #endif
-
-/* STUB calls to mimic old nmi_watchdog behaviour */
-#if defined(CONFIG_X86_LOCAL_APIC)
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-void acpi_nmi_enable(void) { return; }
-void acpi_nmi_disable(void) { return; }
-#endif
-atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
-int unknown_nmi_panic;
-void cpu_nmi_set_wd_enabled(void) { return; }
-void stop_apic_nmi_watchdog(void *unused) { return; }
-void setup_apic_nmi_watchdog(void *unused) { return; }
-int __init check_nmi_watchdog(void) { return 0; }
index fadcd74..16c2db8 100644 (file)
@@ -54,7 +54,6 @@
 #include <asm/dma.h>
 #include <asm/timer.h>
 #include <asm/i8259.h>
-#include <asm/nmi.h>
 #include <asm/msidef.h>
 #include <asm/hypertransport.h>
 #include <asm/setup.h>
@@ -2642,24 +2641,6 @@ static void lapic_register_intr(int irq)
                                      "edge");
 }
 
-static void __init setup_nmi(void)
-{
-       /*
-        * Dirty trick to enable the NMI watchdog ...
-        * We put the 8259A master into AEOI mode and
-        * unmask on all local APICs LVT0 as NMI.
-        *
-        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
-        * is from Maciej W. Rozycki - so we do not have to EOI from
-        * the NMI handler or the timer interrupt.
-        */
-       apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
-
-       enable_NMI_through_LVT0();
-
-       apic_printk(APIC_VERBOSE, " done.\n");
-}
-
 /*
  * This looks a bit hackish but it's about the only one way of sending
  * a few INTA cycles to 8259As and any associated glue logic.  ICR does
@@ -2765,15 +2746,6 @@ static inline void __init check_timer(void)
         */
        apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
        legacy_pic->init(1);
-#ifdef CONFIG_X86_32
-       {
-               unsigned int ver;
-
-               ver = apic_read(APIC_LVR);
-               ver = GET_APIC_VERSION(ver);
-               timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
-       }
-#endif
 
        pin1  = find_isa_irq_pin(0, mp_INT);
        apic1 = find_isa_irq_apic(0, mp_INT);
@@ -2821,10 +2793,6 @@ static inline void __init check_timer(void)
                                unmask_ioapic(cfg);
                }
                if (timer_irq_works()) {
-                       if (nmi_watchdog == NMI_IO_APIC) {
-                               setup_nmi();
-                               legacy_pic->unmask(0);
-                       }
                        if (disable_timer_pin_1 > 0)
                                clear_IO_APIC_pin(0, pin1);
                        goto out;
@@ -2850,11 +2818,6 @@ static inline void __init check_timer(void)
                if (timer_irq_works()) {
                        apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
                        timer_through_8259 = 1;
-                       if (nmi_watchdog == NMI_IO_APIC) {
-                               legacy_pic->mask(0);
-                               setup_nmi();
-                               legacy_pic->unmask(0);
-                       }
                        goto out;
                }
                /*
@@ -2866,15 +2829,6 @@ static inline void __init check_timer(void)
                apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
        }
 
-       if (nmi_watchdog == NMI_IO_APIC) {
-               apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
-                           "through the IO-APIC - disabling NMI Watchdog!\n");
-               nmi_watchdog = NMI_NONE;
-       }
-#ifdef CONFIG_X86_32
-       timer_ack = 0;
-#endif
-
        apic_printk(APIC_QUIET, KERN_INFO
                    "...trying to set up timer as Virtual Wire IRQ...\n");
 
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
deleted file mode 100644 (file)
index c90041c..0000000
+++ /dev/null
@@ -1,567 +0,0 @@
-/*
- *  NMI watchdog support on APIC systems
- *
- *  Started by Ingo Molnar <mingo@redhat.com>
- *
- *  Fixes:
- *  Mikael Pettersson  : AMD K7 support for local APIC NMI watchdog.
- *  Mikael Pettersson  : Power Management for local APIC NMI watchdog.
- *  Mikael Pettersson  : Pentium 4 support for local APIC NMI watchdog.
- *  Pavel Machek and
- *  Mikael Pettersson  : PM converted to driver model. Disable/enable API.
- */
-
-#include <asm/apic.h>
-
-#include <linux/nmi.h>
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/sysdev.h>
-#include <linux/sysctl.h>
-#include <linux/percpu.h>
-#include <linux/kprobes.h>
-#include <linux/cpumask.h>
-#include <linux/kernel_stat.h>
-#include <linux/kdebug.h>
-#include <linux/smp.h>
-
-#include <asm/i8259.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/timer.h>
-
-#include <asm/mce.h>
-
-#include <asm/mach_traps.h>
-
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-
-/* nmi_active:
- * >0: the lapic NMI watchdog is active, but can be disabled
- * <0: the lapic NMI watchdog has not been set up, and cannot
- *     be enabled
- *  0: the lapic NMI watchdog is disabled, but can be enabled
- */
-atomic_t nmi_active = ATOMIC_INIT(0);          /* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
-
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-
-static int panic_on_timeout;
-
-static unsigned int nmi_hz = HZ;
-static DEFINE_PER_CPU(short, wd_enabled);
-static int endflag __initdata;
-
-static inline unsigned int get_nmi_count(int cpu)
-{
-       return per_cpu(irq_stat, cpu).__nmi_count;
-}
-
-static inline int mce_in_progress(void)
-{
-#if defined(CONFIG_X86_MCE)
-       return atomic_read(&mce_entry) > 0;
-#endif
-       return 0;
-}
-
-/*
- * Take the local apic timer and PIT/HPET into account. We don't
- * know which one is active, when we have highres/dyntick on
- */
-static inline unsigned int get_timer_irqs(int cpu)
-{
-       return per_cpu(irq_stat, cpu).apic_timer_irqs +
-               per_cpu(irq_stat, cpu).irq0_irqs;
-}
-
-#ifdef CONFIG_SMP
-/*
- * The performance counters used by NMI_LOCAL_APIC don't trigger when
- * the CPU is idle. To make sure the NMI watchdog really ticks on all
- * CPUs during the test make them busy.
- */
-static __init void nmi_cpu_busy(void *data)
-{
-       local_irq_enable_in_hardirq();
-       /*
-        * Intentionally don't use cpu_relax here. This is
-        * to make sure that the performance counter really ticks,
-        * even if there is a simulator or similar that catches the
-        * pause instruction. On a real HT machine this is fine because
-        * all other CPUs are busy with "useless" delay loops and don't
-        * care if they get somewhat fewer cycles.
-        */
-       while (endflag == 0)
-               mb();
-}
-#endif
-
-static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
-{
-       printk(KERN_CONT "\n");
-
-       printk(KERN_WARNING
-               "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
-                       cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
-
-       printk(KERN_WARNING
-               "Please report this to bugzilla.kernel.org,\n");
-       printk(KERN_WARNING
-               "and attach the output of the 'dmesg' command.\n");
-
-       per_cpu(wd_enabled, cpu) = 0;
-       atomic_dec(&nmi_active);
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
-       apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-int __init check_nmi_watchdog(void)
-{
-       unsigned int *prev_nmi_count;
-       int cpu;
-
-       if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
-               return 0;
-
-       prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
-       if (!prev_nmi_count)
-               goto error;
-
-       printk(KERN_INFO "Testing NMI watchdog ... ");
-
-#ifdef CONFIG_SMP
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
-#endif
-
-       for_each_possible_cpu(cpu)
-               prev_nmi_count[cpu] = get_nmi_count(cpu);
-       local_irq_enable();
-       mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
-
-       for_each_online_cpu(cpu) {
-               if (!per_cpu(wd_enabled, cpu))
-                       continue;
-               if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
-                       report_broken_nmi(cpu, prev_nmi_count);
-       }
-       endflag = 1;
-       if (!atomic_read(&nmi_active)) {
-               kfree(prev_nmi_count);
-               atomic_set(&nmi_active, -1);
-               goto error;
-       }
-       printk("OK.\n");
-
-       /*
-        * now that we know it works we can reduce NMI frequency to
-        * something more reasonable; makes a difference in some configs
-        */
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               nmi_hz = lapic_adjust_nmi_hz(1);
-
-       kfree(prev_nmi_count);
-       return 0;
-error:
-       if (nmi_watchdog == NMI_IO_APIC) {
-               if (!timer_through_8259)
-                       legacy_pic->mask(0);
-               on_each_cpu(__acpi_nmi_disable, NULL, 1);
-       }
-
-#ifdef CONFIG_X86_32
-       timer_ack = 0;
-#endif
-       return -1;
-}
-
-static int __init setup_nmi_watchdog(char *str)
-{
-       unsigned int nmi;
-
-       if (!strncmp(str, "panic", 5)) {
-               panic_on_timeout = 1;
-               str = strchr(str, ',');
-               if (!str)
-                       return 1;
-               ++str;
-       }
-
-       if (!strncmp(str, "lapic", 5))
-               nmi_watchdog = NMI_LOCAL_APIC;
-       else if (!strncmp(str, "ioapic", 6))
-               nmi_watchdog = NMI_IO_APIC;
-       else {
-               get_option(&str, &nmi);
-               if (nmi >= NMI_INVALID)
-                       return 0;
-               nmi_watchdog = nmi;
-       }
-
-       return 1;
-}
-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-/*
- * Suspend/resume support
- */
-#ifdef CONFIG_PM
-
-static int nmi_pm_active; /* nmi_active before suspend */
-
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
-       /* only CPU0 goes here, other CPUs should be offline */
-       nmi_pm_active = atomic_read(&nmi_active);
-       stop_apic_nmi_watchdog(NULL);
-       BUG_ON(atomic_read(&nmi_active) != 0);
-       return 0;
-}
-
-static int lapic_nmi_resume(struct sys_device *dev)
-{
-       /* only CPU0 goes here, other CPUs should be offline */
-       if (nmi_pm_active > 0) {
-               setup_apic_nmi_watchdog(NULL);
-               touch_nmi_watchdog();
-       }
-       return 0;
-}
-
-static struct sysdev_class nmi_sysclass = {
-       .name           = "lapic_nmi",
-       .resume         = lapic_nmi_resume,
-       .suspend        = lapic_nmi_suspend,
-};
-
-static struct sys_device device_lapic_nmi = {
-       .id     = 0,
-       .cls    = &nmi_sysclass,
-};
-
-static int __init init_lapic_nmi_sysfs(void)
-{
-       int error;
-
-       /*
-        * should really be a BUG_ON but b/c this is an
-        * init call, it just doesn't work.  -dcz
-        */
-       if (nmi_watchdog != NMI_LOCAL_APIC)
-               return 0;
-
-       if (atomic_read(&nmi_active) < 0)
-               return 0;
-
-       error = sysdev_class_register(&nmi_sysclass);
-       if (!error)
-               error = sysdev_register(&device_lapic_nmi);
-       return error;
-}
-
-/* must come after the local APIC's device_initcall() */
-late_initcall(init_lapic_nmi_sysfs);
-
-#endif /* CONFIG_PM */
-
-static void __acpi_nmi_enable(void *__unused)
-{
-       apic_write(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
-       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-               on_each_cpu(__acpi_nmi_enable, NULL, 1);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
-       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-               on_each_cpu(__acpi_nmi_disable, NULL, 1);
-}
-
-/*
- * This function is called as soon the LAPIC NMI watchdog driver has everything
- * in place and it's ready to check if the NMIs belong to the NMI watchdog
- */
-void cpu_nmi_set_wd_enabled(void)
-{
-       __get_cpu_var(wd_enabled) = 1;
-}
-
-void setup_apic_nmi_watchdog(void *unused)
-{
-       if (__get_cpu_var(wd_enabled))
-               return;
-
-       /* cheap hack to support suspend/resume */
-       /* if cpu0 is not active neither should the other cpus */
-       if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
-               return;
-
-       switch (nmi_watchdog) {
-       case NMI_LOCAL_APIC:
-               if (lapic_watchdog_init(nmi_hz) < 0) {
-                       __get_cpu_var(wd_enabled) = 0;
-                       return;
-               }
-               /* FALL THROUGH */
-       case NMI_IO_APIC:
-               __get_cpu_var(wd_enabled) = 1;
-               atomic_inc(&nmi_active);
-       }
-}
-
-void stop_apic_nmi_watchdog(void *unused)
-{
-       /* only support LOCAL and IO APICs for now */
-       if (!nmi_watchdog_active())
-               return;
-       if (__get_cpu_var(wd_enabled) == 0)
-               return;
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               lapic_watchdog_stop();
-       else
-               __acpi_nmi_disable(NULL);
-       __get_cpu_var(wd_enabled) = 0;
-       atomic_dec(&nmi_active);
-}
-
-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check its local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- *
- * since NMIs don't listen to _any_ locks, we have to be extremely
- * careful not to rely on unsafe variables. The printk might lock
- * up though, so we have to break up any console locks first ...
- * [when there will be more tty-related locks, break them up here too!]
- */
-
-static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(long, alert_counter);
-static DEFINE_PER_CPU(int, nmi_touch);
-
-void touch_nmi_watchdog(void)
-{
-       if (nmi_watchdog_active()) {
-               unsigned cpu;
-
-               /*
-                * Tell other CPUs to reset their alert counters. We cannot
-                * do it ourselves because the alert count increase is not
-                * atomic.
-                */
-               for_each_present_cpu(cpu) {
-                       if (per_cpu(nmi_touch, cpu) != 1)
-                               per_cpu(nmi_touch, cpu) = 1;
-               }
-       }
-
-       /*
-        * Tickle the softlockup detector too:
-        */
-       touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-notrace __kprobes int
-nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
-{
-       /*
-        * Since current_thread_info()-> is always on the stack, and we
-        * always switch the stack NMI-atomically, it's safe to use
-        * smp_processor_id().
-        */
-       unsigned int sum;
-       int touched = 0;
-       int cpu = smp_processor_id();
-       int rc = 0;
-
-       sum = get_timer_irqs(cpu);
-
-       if (__get_cpu_var(nmi_touch)) {
-               __get_cpu_var(nmi_touch) = 0;
-               touched = 1;
-       }
-
-       /* We can be called before check_nmi_watchdog, hence NULL check. */
-       if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-               static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
-
-               raw_spin_lock(&lock);
-               printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
-               show_regs(regs);
-               dump_stack();
-               raw_spin_unlock(&lock);
-               cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
-
-               rc = 1;
-       }
-
-       /* Could check oops_in_progress here too, but it's safer not to */
-       if (mce_in_progress())
-               touched = 1;
-
-       /* if none of the timers is firing, this cpu isn't doing much */
-       if (!touched && __get_cpu_var(last_irq_sum) == sum) {
-               /*
-                * Ayiee, looks like this CPU is stuck ...
-                * wait a few IRQs (5 seconds) before doing the oops ...
-                */
-               __this_cpu_inc(alert_counter);
-               if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
-                       /*
-                        * die_nmi will return ONLY if NOTIFY_STOP happens..
-                        */
-                       die_nmi("BUG: NMI Watchdog detected LOCKUP",
-                               regs, panic_on_timeout);
-       } else {
-               __get_cpu_var(last_irq_sum) = sum;
-               __this_cpu_write(alert_counter, 0);
-       }
-
-       /* see if the nmi watchdog went off */
-       if (!__get_cpu_var(wd_enabled))
-               return rc;
-       switch (nmi_watchdog) {
-       case NMI_LOCAL_APIC:
-               rc |= lapic_wd_event(nmi_hz);
-               break;
-       case NMI_IO_APIC:
-               /*
-                * don't know how to accurately check for this;
-                * just assume it was a watchdog timer interrupt.
-                * This matches the old behaviour.
-                */
-               rc = 1;
-               break;
-       }
-       return rc;
-}
-
-#ifdef CONFIG_SYSCTL
-
-static void enable_ioapic_nmi_watchdog_single(void *unused)
-{
-       __get_cpu_var(wd_enabled) = 1;
-       atomic_inc(&nmi_active);
-       __acpi_nmi_enable(NULL);
-}
-
-static void enable_ioapic_nmi_watchdog(void)
-{
-       on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
-       touch_nmi_watchdog();
-}
-
-static void disable_ioapic_nmi_watchdog(void)
-{
-       on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-}
-
-static int __init setup_unknown_nmi_panic(char *str)
-{
-       unknown_nmi_panic = 1;
-       return 1;
-}
-__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
-       unsigned char reason = get_nmi_reason();
-       char buf[64];
-
-       sprintf(buf, "NMI received for unknown reason %02x\n", reason);
-       die_nmi(buf, regs, 1); /* Always panic here */
-       return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/nmi
- */
-int proc_nmi_enabled(struct ctl_table *table, int write,
-                       void __user *buffer, size_t *length, loff_t *ppos)
-{
-       int old_state;
-
-       nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
-       old_state = nmi_watchdog_enabled;
-       proc_dointvec(table, write, buffer, length, ppos);
-       if (!!old_state == !!nmi_watchdog_enabled)
-               return 0;
-
-       if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
-               printk(KERN_WARNING
-                       "NMI watchdog is permanently disabled\n");
-               return -EIO;
-       }
-
-       if (nmi_watchdog == NMI_LOCAL_APIC) {
-               if (nmi_watchdog_enabled)
-                       enable_lapic_nmi_watchdog();
-               else
-                       disable_lapic_nmi_watchdog();
-       } else if (nmi_watchdog == NMI_IO_APIC) {
-               if (nmi_watchdog_enabled)
-                       enable_ioapic_nmi_watchdog();
-               else
-                       disable_ioapic_nmi_watchdog();
-       } else {
-               printk(KERN_WARNING
-                       "NMI watchdog doesn't know what hardware to touch\n");
-               return -EIO;
-       }
-       return 0;
-}
-
-#endif /* CONFIG_SYSCTL */
-
-int do_nmi_callback(struct pt_regs *regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
-       if (unknown_nmi_panic)
-               return unknown_nmi_panic_callback(regs, cpu);
-#endif
-       return 0;
-}
-
-void arch_trigger_all_cpu_backtrace(void)
-{
-       int i;
-
-       cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
-
-       printk(KERN_INFO "sending NMI to all CPUs:\n");
-       apic->send_IPI_all(NMI_VECTOR);
-
-       /* Wait for up to 10 seconds for all CPUs to do the backtrace */
-       for (i = 0; i < 10 * 1000; i++) {
-               if (cpumask_empty(to_cpumask(backtrace_mask)))
-                       break;
-               mdelay(1);
-       }
-}
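
The heart of the watchdog being removed here is nmi_watchdog_tick(): on every watchdog NMI it samples the per-CPU timer interrupt counts and, once they stop moving, counts NMIs until roughly five seconds have elapsed before declaring a lockup. A minimal single-CPU sketch of that heuristic follows; sample_timer_irqs() and watchdog_panic() are hypothetical stand-ins for the per-CPU counters and the die_nmi() call above.

/*
 * Minimal sketch of the removed lockup heuristic.  The real code keeps
 * last_irq_sum/alert_counter per CPU; sample_timer_irqs() and
 * watchdog_panic() are hypothetical stand-ins.
 */
unsigned int sample_timer_irqs(void);   /* apic_timer_irqs + irq0_irqs */
void watchdog_panic(const char *msg);   /* wraps die_nmi() in the real code */

static unsigned int last_sum;
static long alert_counter;

static void watchdog_nmi_tick(unsigned int nmi_hz)
{
        unsigned int sum = sample_timer_irqs();

        if (sum == last_sum) {
                /* no timer progress: wait ~5 seconds worth of NMIs */
                if (++alert_counter == 5 * nmi_hz)
                        watchdog_panic("BUG: NMI Watchdog detected LOCKUP");
        } else {
                last_sum = sum;
                alert_counter = 0;
        }
}
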
index 4b68bda..1d59834 100644 (file)
@@ -894,7 +894,6 @@ void __init identify_boot_cpu(void)
 #else
        vgetcpu_set_mode();
 #endif
-       init_hw_perf_events();
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
index 6d75b91..0a360d1 100644 (file)
@@ -330,9 +330,6 @@ static bool reserve_pmc_hardware(void)
 {
        int i;
 
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               disable_lapic_nmi_watchdog();
-
        for (i = 0; i < x86_pmu.num_counters; i++) {
                if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
                        goto perfctr_fail;
@@ -355,9 +352,6 @@ perfctr_fail:
        for (i--; i >= 0; i--)
                release_perfctr_nmi(x86_pmu.perfctr + i);
 
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               enable_lapic_nmi_watchdog();
-
        return false;
 }
 
@@ -369,9 +363,6 @@ static void release_pmc_hardware(void)
                release_perfctr_nmi(x86_pmu.perfctr + i);
                release_evntsel_nmi(x86_pmu.eventsel + i);
        }
-
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               enable_lapic_nmi_watchdog();
 }
 
 #else
@@ -384,15 +375,53 @@ static void release_pmc_hardware(void) {}
 static bool check_hw_exists(void)
 {
        u64 val, val_new = 0;
-       int ret = 0;
+       int i, reg, ret = 0;
+
+       /*
+        * Check to see if the BIOS enabled any of the counters, if so
+        * complain and bail.
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               reg = x86_pmu.eventsel + i;
+               ret = rdmsrl_safe(reg, &val);
+               if (ret)
+                       goto msr_fail;
+               if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
+                       goto bios_fail;
+       }
 
+       if (x86_pmu.num_counters_fixed) {
+               reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+               ret = rdmsrl_safe(reg, &val);
+               if (ret)
+                       goto msr_fail;
+               for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+                       if (val & (0x03 << i*4))
+                               goto bios_fail;
+               }
+       }
+
+       /*
+        * Now write a value and read it back to see if it matches,
+        * this is needed to detect certain hardware emulators (qemu/kvm)
+        * that don't trap on the MSR access and always return 0s.
+        */
        val = 0xabcdUL;
-       ret |= checking_wrmsrl(x86_pmu.perfctr, val);
+       ret = checking_wrmsrl(x86_pmu.perfctr, val);
        ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new);
        if (ret || val != val_new)
-               return false;
+               goto msr_fail;
 
        return true;
+
+bios_fail:
+       printk(KERN_CONT "Broken BIOS detected, using software events only.\n");
+       printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
+       return false;
+
+msr_fail:
+       printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
+       return false;
 }
 
 static void reserve_ds_buffers(void);
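
The fixed-counter scan in the hunk above relies on the layout of MSR_ARCH_PERFMON_FIXED_CTR_CTRL: each fixed counter i owns a four-bit control field starting at bit 4*i, and the low two bits of that field are its ring-level enable bits, which is what the 0x03 mask tests. A small illustrative helper; fixed_counter_enabled() is not part of the patch.

/*
 * Illustration of the (0x03 << i*4) test above: fixed counter i counts
 * as BIOS-enabled if either of its ring-level enable bits is set.
 */
static inline int fixed_counter_enabled(unsigned long long fixed_ctrl, int i)
{
        return (fixed_ctrl >> (i * 4)) & 0x03;
}
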
@@ -451,7 +480,7 @@ static int x86_setup_perfctr(struct perf_event *event)
        struct hw_perf_event *hwc = &event->hw;
        u64 config;
 
-       if (!hwc->sample_period) {
+       if (!is_sampling_event(event)) {
                hwc->sample_period = x86_pmu.max_period;
                hwc->last_period = hwc->sample_period;
                local64_set(&hwc->period_left, hwc->sample_period);
@@ -1362,7 +1391,7 @@ static void __init pmu_check_apic(void)
        pr_info("no hardware sampling interrupt available.\n");
 }
 
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
 {
        struct event_constraint *c;
        int err;
@@ -1377,20 +1406,18 @@ void __init init_hw_perf_events(void)
                err = amd_pmu_init();
                break;
        default:
-               return;
+               return 0;
        }
        if (err != 0) {
                pr_cont("no PMU driver, software events only.\n");
-               return;
+               return 0;
        }
 
        pmu_check_apic();
 
        /* sanity check that the hardware exists or is emulated */
-       if (!check_hw_exists()) {
-               pr_cont("Broken PMU hardware detected, software events only.\n");
-               return;
-       }
+       if (!check_hw_exists())
+               return 0;
 
        pr_cont("%s PMU driver.\n", x86_pmu.name);
 
@@ -1438,9 +1465,12 @@ void __init init_hw_perf_events(void)
        pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
        pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
 
-       perf_pmu_register(&pmu);
+       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
        perf_cpu_notifier(x86_pmu_notifier);
+
+       return 0;
 }
+early_initcall(init_hw_perf_events);
 
 static inline void x86_pmu_read(struct perf_event *event)
 {
@@ -1686,7 +1716,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
        perf_callchain_store(entry, regs->ip);
 
-       dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
+       dump_trace(NULL, regs, NULL, &backtrace_ops, entry);
 }
 
 #ifdef CONFIG_COMPAT
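
The last hunk turns init_hw_perf_events() into an early_initcall() instead of an explicit call from identify_boot_cpu(), which is why its return type changes from void to int: initcall functions report success by returning 0. The shape of that pattern, with a hypothetical my_pmu_init() standing in for the real routine:

/* Sketch of the initcall pattern adopted above; my_pmu_init() is a
 * made-up example, not the actual x86 code. */
static int __init my_pmu_init(void)
{
        /* probe the PMU, register it with the perf core, ... */
        return 0;               /* initcalls return 0 on success */
}
early_initcall(my_pmu_init);
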
index e421b8c..67e2202 100644 (file)
@@ -1,7 +1,5 @@
 #ifdef CONFIG_CPU_SUP_AMD
 
-static DEFINE_RAW_SPINLOCK(amd_nb_lock);
-
 static __initconst const u64 amd_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -275,7 +273,7 @@ done:
        return &emptyconstraint;
 }
 
-static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
+static struct amd_nb *amd_alloc_nb(int cpu)
 {
        struct amd_nb *nb;
        int i;
@@ -285,7 +283,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
        if (!nb)
                return NULL;
 
-       nb->nb_id = nb_id;
+       nb->nb_id = -1;
 
        /*
         * initialize all possible NB constraints
@@ -306,7 +304,7 @@ static int amd_pmu_cpu_prepare(int cpu)
        if (boot_cpu_data.x86_max_cores < 2)
                return NOTIFY_OK;
 
-       cpuc->amd_nb = amd_alloc_nb(cpu, -1);
+       cpuc->amd_nb = amd_alloc_nb(cpu);
        if (!cpuc->amd_nb)
                return NOTIFY_BAD;
 
@@ -325,8 +323,6 @@ static void amd_pmu_cpu_starting(int cpu)
        nb_id = amd_get_nb_id(cpu);
        WARN_ON_ONCE(nb_id == BAD_APICID);
 
-       raw_spin_lock(&amd_nb_lock);
-
        for_each_online_cpu(i) {
                nb = per_cpu(cpu_hw_events, i).amd_nb;
                if (WARN_ON_ONCE(!nb))
@@ -341,8 +337,6 @@ static void amd_pmu_cpu_starting(int cpu)
 
        cpuc->amd_nb->nb_id = nb_id;
        cpuc->amd_nb->refcnt++;
-
-       raw_spin_unlock(&amd_nb_lock);
 }
 
 static void amd_pmu_cpu_dead(int cpu)
@@ -354,8 +348,6 @@ static void amd_pmu_cpu_dead(int cpu)
 
        cpuhw = &per_cpu(cpu_hw_events, cpu);
 
-       raw_spin_lock(&amd_nb_lock);
-
        if (cpuhw->amd_nb) {
                struct amd_nb *nb = cpuhw->amd_nb;
 
@@ -364,8 +356,6 @@ static void amd_pmu_cpu_dead(int cpu)
 
                cpuhw->amd_nb = NULL;
        }
-
-       raw_spin_unlock(&amd_nb_lock);
 }
 
 static __initconst const struct x86_pmu amd_pmu = {
index c8f5c08..24e390e 100644 (file)
@@ -816,6 +816,32 @@ static int intel_pmu_hw_config(struct perf_event *event)
        if (ret)
                return ret;
 
+       if (event->attr.precise_ip &&
+           (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+               /*
+                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+                * (0x003c) so that we can use it with PEBS.
+                *
+                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+                * PEBS capable. However we can use INST_RETIRED.ANY_P
+                * (0x00c0), which is a PEBS capable event, to get the same
+                * count.
+                *
+                * INST_RETIRED.ANY_P counts the number of cycles that retire
+                * CNTMASK instructions. By setting CNTMASK to a value (16)
+                * larger than the maximum number of instructions that can be
+                * retired per cycle (4) and then inverting the condition, we
+                * count all cycles that retire 16 or less instructions, which
+                * is every cycle.
+                *
+                * Thereby we gain a PEBS capable cycle counter.
+                */
+               u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
+
+               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+               event->hw.config = alt_config;
+       }
+
        if (event->attr.type != PERF_TYPE_RAW)
                return 0;
 
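
The 0x108000c0 constant above is simply the architectural PERFEVTSEL encoding of the trick described in the comment: event 0xc0 (INST_RETIRED.ANY_P) in bits 7:0, the invert flag in bit 23, and a counter mask of 16 in bits 31:24. A small sketch recomputing it from those fields; the EVSEL_* macro names are illustrative, not kernel definitions.

/* Recompute the INST_RETIRED.TOTAL_CYCLES encoding from the PERFEVTSEL
 * bit fields; the macro names here are illustrative only. */
#define EVSEL_EVENT(e)  ((unsigned long long)(e) & 0xff)        /* bits  7:0  */
#define EVSEL_INV       (1ULL << 23)                            /* invert cmask */
#define EVSEL_CMASK(c)  ((unsigned long long)(c) << 24)         /* bits 31:24 */

static const unsigned long long inst_retired_total_cycles =
        EVSEL_EVENT(0xc0) | EVSEL_INV | EVSEL_CMASK(16);        /* 0x108000c0 */
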
index d9f4ff8..d5a2366 100644 (file)
 #include <linux/kernel.h>
 #include <linux/bitops.h>
 #include <linux/smp.h>
-#include <linux/nmi.h>
+#include <asm/nmi.h>
 #include <linux/kprobes.h>
 
 #include <asm/apic.h>
 #include <asm/perf_event.h>
 
-struct nmi_watchdog_ctlblk {
-       unsigned int cccr_msr;
-       unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
-       unsigned int evntsel_msr;  /* the MSR to select the events to handle */
-};
-
-/* Interface defining a CPU specific perfctr watchdog */
-struct wd_ops {
-       int (*reserve)(void);
-       void (*unreserve)(void);
-       int (*setup)(unsigned nmi_hz);
-       void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
-       void (*stop)(void);
-       unsigned perfctr;
-       unsigned evntsel;
-       u64 checkbit;
-};
-
-static const struct wd_ops *wd_ops;
-
 /*
  * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
  * offset from MSR_P4_BSU_ESCR0.
@@ -60,8 +40,6 @@ static const struct wd_ops *wd_ops;
 static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
 static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
 
-static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
-
 /* converts an msr to an appropriate reservation bit */
 static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
 {
@@ -172,623 +150,3 @@ void release_evntsel_nmi(unsigned int msr)
        clear_bit(counter, evntsel_nmi_owner);
 }
 EXPORT_SYMBOL(release_evntsel_nmi);
-
-void disable_lapic_nmi_watchdog(void)
-{
-       BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
-       if (atomic_read(&nmi_active) <= 0)
-               return;
-
-       on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-
-       if (wd_ops)
-               wd_ops->unreserve();
-
-       BUG_ON(atomic_read(&nmi_active) != 0);
-}
-
-void enable_lapic_nmi_watchdog(void)
-{
-       BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
-       /* are we already enabled */
-       if (atomic_read(&nmi_active) != 0)
-               return;
-
-       /* are we lapic aware */
-       if (!wd_ops)
-               return;
-       if (!wd_ops->reserve()) {
-               printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
-               return;
-       }
-
-       on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
-       touch_nmi_watchdog();
-}
-
-/*
- * Activate the NMI watchdog via the local APIC.
- */
-
-static unsigned int adjust_for_32bit_ctr(unsigned int hz)
-{
-       u64 counter_val;
-       unsigned int retval = hz;
-
-       /*
-        * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
-        * are writable, with higher bits sign extending from bit 31.
-        * So we can only program the counter with 31-bit values, and
-        * bit 31 must be set so the sign extension keeps the value negative.
-        * Find the appropriate nmi_hz
-        */
-       counter_val = (u64)cpu_khz * 1000;
-       do_div(counter_val, retval);
-       if (counter_val > 0x7fffffffULL) {
-               u64 count = (u64)cpu_khz * 1000;
-               do_div(count, 0x7fffffffUL);
-               retval = count + 1;
-       }
-       return retval;
-}
-
-static void write_watchdog_counter(unsigned int perfctr_msr,
-                               const char *descr, unsigned nmi_hz)
-{
-       u64 count = (u64)cpu_khz * 1000;
-
-       do_div(count, nmi_hz);
-       if (descr)
-               pr_debug("setting %s to -0x%08Lx\n", descr, count);
-       wrmsrl(perfctr_msr, 0 - count);
-}
-
-static void write_watchdog_counter32(unsigned int perfctr_msr,
-                               const char *descr, unsigned nmi_hz)
-{
-       u64 count = (u64)cpu_khz * 1000;
-
-       do_div(count, nmi_hz);
-       if (descr)
-               pr_debug("setting %s to -0x%08Lx\n", descr, count);
-       wrmsr(perfctr_msr, (u32)(-count), 0);
-}
-
-/*
- * AMD K7/K8/Family10h/Family11h support.
- * AMD keeps this interface nicely stable so there is not much variety
- */
-#define K7_EVNTSEL_ENABLE      (1 << 22)
-#define K7_EVNTSEL_INT         (1 << 20)
-#define K7_EVNTSEL_OS          (1 << 17)
-#define K7_EVNTSEL_USR         (1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING   0x76
-#define K7_NMI_EVENT           K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
-
-static int setup_k7_watchdog(unsigned nmi_hz)
-{
-       unsigned int perfctr_msr, evntsel_msr;
-       unsigned int evntsel;
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-       perfctr_msr = wd_ops->perfctr;
-       evntsel_msr = wd_ops->evntsel;
-
-       wrmsrl(perfctr_msr, 0UL);
-
-       evntsel = K7_EVNTSEL_INT
-               | K7_EVNTSEL_OS
-               | K7_EVNTSEL_USR
-               | K7_NMI_EVENT;
-
-       /* setup the timer */
-       wrmsr(evntsel_msr, evntsel, 0);
-       write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
-
-       /* initialize the wd struct before enabling */
-       wd->perfctr_msr = perfctr_msr;
-       wd->evntsel_msr = evntsel_msr;
-       wd->cccr_msr = 0;  /* unused */
-
-       /* ok, everything is initialized, announce that we're set */
-       cpu_nmi_set_wd_enabled();
-
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-       evntsel |= K7_EVNTSEL_ENABLE;
-       wrmsr(evntsel_msr, evntsel, 0);
-
-       return 1;
-}
-
-static void single_msr_stop_watchdog(void)
-{
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-       wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int single_msr_reserve(void)
-{
-       if (!reserve_perfctr_nmi(wd_ops->perfctr))
-               return 0;
-
-       if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
-               release_perfctr_nmi(wd_ops->perfctr);
-               return 0;
-       }
-       return 1;
-}
-
-static void single_msr_unreserve(void)
-{
-       release_evntsel_nmi(wd_ops->evntsel);
-       release_perfctr_nmi(wd_ops->perfctr);
-}
-
-static void __kprobes
-single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
-       /* start the cycle over again */
-       write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops k7_wd_ops = {
-       .reserve        = single_msr_reserve,
-       .unreserve      = single_msr_unreserve,
-       .setup          = setup_k7_watchdog,
-       .rearm          = single_msr_rearm,
-       .stop           = single_msr_stop_watchdog,
-       .perfctr        = MSR_K7_PERFCTR0,
-       .evntsel        = MSR_K7_EVNTSEL0,
-       .checkbit       = 1ULL << 47,
-};
-
-/*
- * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
- */
-#define P6_EVNTSEL0_ENABLE     (1 << 22)
-#define P6_EVNTSEL_INT         (1 << 20)
-#define P6_EVNTSEL_OS          (1 << 17)
-#define P6_EVNTSEL_USR         (1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
-#define P6_NMI_EVENT           P6_EVENT_CPU_CLOCKS_NOT_HALTED
-
-static int setup_p6_watchdog(unsigned nmi_hz)
-{
-       unsigned int perfctr_msr, evntsel_msr;
-       unsigned int evntsel;
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-       perfctr_msr = wd_ops->perfctr;
-       evntsel_msr = wd_ops->evntsel;
-
-       /* KVM doesn't implement this MSR */
-       if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
-               return 0;
-
-       evntsel = P6_EVNTSEL_INT
-               | P6_EVNTSEL_OS
-               | P6_EVNTSEL_USR
-               | P6_NMI_EVENT;
-
-       /* setup the timer */
-       wrmsr(evntsel_msr, evntsel, 0);
-       nmi_hz = adjust_for_32bit_ctr(nmi_hz);
-       write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
-
-       /* initialize the wd struct before enabling */
-       wd->perfctr_msr = perfctr_msr;
-       wd->evntsel_msr = evntsel_msr;
-       wd->cccr_msr = 0;  /* unused */
-
-       /* ok, everything is initialized, announce that we're set */
-       cpu_nmi_set_wd_enabled();
-
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-       evntsel |= P6_EVNTSEL0_ENABLE;
-       wrmsr(evntsel_msr, evntsel, 0);
-
-       return 1;
-}
-
-static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
-       /*
-        * P6-based Pentium M needs to re-unmask
-        * the APIC vector, but it doesn't hurt
-        * other P6 variants.
-        * ArchPerfmon/Core Duo also needs this.
-        */
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-
-       /* P6/ARCH_PERFMON has 32 bit counter write */
-       write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops p6_wd_ops = {
-       .reserve        = single_msr_reserve,
-       .unreserve      = single_msr_unreserve,
-       .setup          = setup_p6_watchdog,
-       .rearm          = p6_rearm,
-       .stop           = single_msr_stop_watchdog,
-       .perfctr        = MSR_P6_PERFCTR0,
-       .evntsel        = MSR_P6_EVNTSEL0,
-       .checkbit       = 1ULL << 39,
-};
-
-/*
- * Intel P4 performance counters.
- * By far the most complicated of all.
- */
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL  (1 << 7)
-#define P4_ESCR_EVENT_SELECT(N)        ((N) << 25)
-#define P4_ESCR_OS             (1 << 3)
-#define P4_ESCR_USR            (1 << 2)
-#define P4_CCCR_OVF_PMI0       (1 << 26)
-#define P4_CCCR_OVF_PMI1       (1 << 27)
-#define P4_CCCR_THRESHOLD(N)   ((N) << 20)
-#define P4_CCCR_COMPLEMENT     (1 << 19)
-#define P4_CCCR_COMPARE                (1 << 18)
-#define P4_CCCR_REQUIRED       (3 << 16)
-#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
-#define P4_CCCR_ENABLE         (1 << 12)
-#define P4_CCCR_OVF            (1 << 31)
-
-#define P4_CONTROLS 18
-static unsigned int p4_controls[18] = {
-       MSR_P4_BPU_CCCR0,
-       MSR_P4_BPU_CCCR1,
-       MSR_P4_BPU_CCCR2,
-       MSR_P4_BPU_CCCR3,
-       MSR_P4_MS_CCCR0,
-       MSR_P4_MS_CCCR1,
-       MSR_P4_MS_CCCR2,
-       MSR_P4_MS_CCCR3,
-       MSR_P4_FLAME_CCCR0,
-       MSR_P4_FLAME_CCCR1,
-       MSR_P4_FLAME_CCCR2,
-       MSR_P4_FLAME_CCCR3,
-       MSR_P4_IQ_CCCR0,
-       MSR_P4_IQ_CCCR1,
-       MSR_P4_IQ_CCCR2,
-       MSR_P4_IQ_CCCR3,
-       MSR_P4_IQ_CCCR4,
-       MSR_P4_IQ_CCCR5,
-};
-/*
- * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- * CRU_ESCR0 (with any non-null event selector) through a complemented
- * max threshold. [IA32-Vol3, Section 14.9.9]
- */
-static int setup_p4_watchdog(unsigned nmi_hz)
-{
-       unsigned int perfctr_msr, evntsel_msr, cccr_msr;
-       unsigned int evntsel, cccr_val;
-       unsigned int misc_enable, dummy;
-       unsigned int ht_num;
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-       rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
-       if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
-               return 0;
-
-#ifdef CONFIG_SMP
-       /* detect which hyperthread we are on */
-       if (smp_num_siblings == 2) {
-               unsigned int ebx, apicid;
-
-               ebx = cpuid_ebx(1);
-               apicid = (ebx >> 24) & 0xff;
-               ht_num = apicid & 1;
-       } else
-#endif
-               ht_num = 0;
-
-       /*
-        * performance counters are shared resources
-        * assign each hyperthread its own set
-        * (re-use the ESCR0 register, seems safe
-        * and keeps the cccr_val the same)
-        */
-       if (!ht_num) {
-               /* logical cpu 0 */
-               perfctr_msr = MSR_P4_IQ_PERFCTR0;
-               evntsel_msr = MSR_P4_CRU_ESCR0;
-               cccr_msr = MSR_P4_IQ_CCCR0;
-               cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
-
-               /*
-                * If we're on the kdump kernel or other situation, we may
-                * still have other performance counter registers set to
-                * interrupt and they'll keep interrupting forever because
-                * of the P4_CCCR_OVF quirk. So we need to ACK all the
-                * pending interrupts and disable all the registers here,
-                * before reenabling the NMI delivery. Refer to p4_rearm()
-                * about the P4_CCCR_OVF quirk.
-                */
-               if (reset_devices) {
-                       unsigned int low, high;
-                       int i;
-
-                       for (i = 0; i < P4_CONTROLS; i++) {
-                               rdmsr(p4_controls[i], low, high);
-                               low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
-                               wrmsr(p4_controls[i], low, high);
-                       }
-               }
-       } else {
-               /* logical cpu 1 */
-               perfctr_msr = MSR_P4_IQ_PERFCTR1;
-               evntsel_msr = MSR_P4_CRU_ESCR0;
-               cccr_msr = MSR_P4_IQ_CCCR1;
-
-               /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
-               if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
-                       cccr_val = P4_CCCR_OVF_PMI0;
-               else
-                       cccr_val = P4_CCCR_OVF_PMI1;
-               cccr_val |= P4_CCCR_ESCR_SELECT(4);
-       }
-
-       evntsel = P4_ESCR_EVENT_SELECT(0x3F)
-               | P4_ESCR_OS
-               | P4_ESCR_USR;
-
-       cccr_val |= P4_CCCR_THRESHOLD(15)
-                | P4_CCCR_COMPLEMENT
-                | P4_CCCR_COMPARE
-                | P4_CCCR_REQUIRED;
-
-       wrmsr(evntsel_msr, evntsel, 0);
-       wrmsr(cccr_msr, cccr_val, 0);
-       write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
-
-       wd->perfctr_msr = perfctr_msr;
-       wd->evntsel_msr = evntsel_msr;
-       wd->cccr_msr = cccr_msr;
-
-       /* ok, everything is initialized, announce that we're set */
-       cpu_nmi_set_wd_enabled();
-
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-       cccr_val |= P4_CCCR_ENABLE;
-       wrmsr(cccr_msr, cccr_val, 0);
-       return 1;
-}
-
-static void stop_p4_watchdog(void)
-{
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-       wrmsr(wd->cccr_msr, 0, 0);
-       wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int p4_reserve(void)
-{
-       if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
-               return 0;
-#ifdef CONFIG_SMP
-       if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
-               goto fail1;
-#endif
-       if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
-               goto fail2;
-       /* RED-PEN why is ESCR1 not reserved here? */
-       return 1;
- fail2:
-#ifdef CONFIG_SMP
-       if (smp_num_siblings > 1)
-               release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
- fail1:
-#endif
-       release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
-       return 0;
-}
-
-static void p4_unreserve(void)
-{
-#ifdef CONFIG_SMP
-       if (smp_num_siblings > 1)
-               release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
-#endif
-       release_evntsel_nmi(MSR_P4_CRU_ESCR0);
-       release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
-}
-
-static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
-       unsigned dummy;
-       /*
-        * P4 quirks:
-        * - An overflown perfctr will assert its interrupt
-        *   until the OVF flag in its CCCR is cleared.
-        * - LVTPC is masked on interrupt and must be
-        *   unmasked by the LVTPC handler.
-        */
-       rdmsrl(wd->cccr_msr, dummy);
-       dummy &= ~P4_CCCR_OVF;
-       wrmsrl(wd->cccr_msr, dummy);
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-       /* start the cycle over again */
-       write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops p4_wd_ops = {
-       .reserve        = p4_reserve,
-       .unreserve      = p4_unreserve,
-       .setup          = setup_p4_watchdog,
-       .rearm          = p4_rearm,
-       .stop           = stop_p4_watchdog,
-       /* RED-PEN this is wrong for the other sibling */
-       .perfctr        = MSR_P4_BPU_PERFCTR0,
-       .evntsel        = MSR_P4_BSU_ESCR0,
-       .checkbit       = 1ULL << 39,
-};
-
-/*
- * Watchdog using the Intel architected PerfMon.
- * Used for Core2 and hopefully all future Intel CPUs.
- */
-#define ARCH_PERFMON_NMI_EVENT_SEL     ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK   ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
-
-static struct wd_ops intel_arch_wd_ops;
-
-static int setup_intel_arch_watchdog(unsigned nmi_hz)
-{
-       unsigned int ebx;
-       union cpuid10_eax eax;
-       unsigned int unused;
-       unsigned int perfctr_msr, evntsel_msr;
-       unsigned int evntsel;
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-       /*
-        * Check whether the Architectural PerfMon supports
-        * Unhalted Core Cycles Event or not.
-        * NOTE: Corresponding bit = 0 in ebx indicates event present.
-        */
-       cpuid(10, &(eax.full), &ebx, &unused, &unused);
-       if ((eax.split.mask_length <
-                       (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
-           (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
-               return 0;
-
-       perfctr_msr = wd_ops->perfctr;
-       evntsel_msr = wd_ops->evntsel;
-
-       wrmsrl(perfctr_msr, 0UL);
-
-       evntsel = ARCH_PERFMON_EVENTSEL_INT
-               | ARCH_PERFMON_EVENTSEL_OS
-               | ARCH_PERFMON_EVENTSEL_USR
-               | ARCH_PERFMON_NMI_EVENT_SEL
-               | ARCH_PERFMON_NMI_EVENT_UMASK;
-
-       /* setup the timer */
-       wrmsr(evntsel_msr, evntsel, 0);
-       nmi_hz = adjust_for_32bit_ctr(nmi_hz);
-       write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
-
-       wd->perfctr_msr = perfctr_msr;
-       wd->evntsel_msr = evntsel_msr;
-       wd->cccr_msr = 0;  /* unused */
-
-       /* ok, everything is initialized, announce that we're set */
-       cpu_nmi_set_wd_enabled();
-
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-       evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
-       wrmsr(evntsel_msr, evntsel, 0);
-       intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
-       return 1;
-}
-
-static struct wd_ops intel_arch_wd_ops __read_mostly = {
-       .reserve        = single_msr_reserve,
-       .unreserve      = single_msr_unreserve,
-       .setup          = setup_intel_arch_watchdog,
-       .rearm          = p6_rearm,
-       .stop           = single_msr_stop_watchdog,
-       .perfctr        = MSR_ARCH_PERFMON_PERFCTR1,
-       .evntsel        = MSR_ARCH_PERFMON_EVENTSEL1,
-};
-
-static void probe_nmi_watchdog(void)
-{
-       switch (boot_cpu_data.x86_vendor) {
-       case X86_VENDOR_AMD:
-               if (boot_cpu_data.x86 == 6 ||
-                   (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
-                       wd_ops = &k7_wd_ops;
-               return;
-       case X86_VENDOR_INTEL:
-               /* Work around where perfctr1 doesn't have a working enable
-                * bit as described in the following errata:
-                * AE49 Core Duo and Intel Core Solo 65 nm
-                * AN49 Intel Pentium Dual-Core
-                * AF49 Dual-Core Intel Xeon Processor LV
-                */
-               if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
-                   ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
-                    boot_cpu_data.x86_mask == 4))) {
-                       intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
-                       intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
-               }
-               if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-                       wd_ops = &intel_arch_wd_ops;
-                       break;
-               }
-               switch (boot_cpu_data.x86) {
-               case 6:
-                       if (boot_cpu_data.x86_model > 13)
-                               return;
-
-                       wd_ops = &p6_wd_ops;
-                       break;
-               case 15:
-                       wd_ops = &p4_wd_ops;
-                       break;
-               default:
-                       return;
-               }
-               break;
-       }
-}
-
-/* Interface to nmi.c */
-
-int lapic_watchdog_init(unsigned nmi_hz)
-{
-       if (!wd_ops) {
-               probe_nmi_watchdog();
-               if (!wd_ops) {
-                       printk(KERN_INFO "NMI watchdog: CPU not supported\n");
-                       return -1;
-               }
-
-               if (!wd_ops->reserve()) {
-                       printk(KERN_ERR
-                               "NMI watchdog: cannot reserve perfctrs\n");
-                       return -1;
-               }
-       }
-
-       if (!(wd_ops->setup(nmi_hz))) {
-               printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
-                      raw_smp_processor_id());
-               return -1;
-       }
-
-       return 0;
-}
-
-void lapic_watchdog_stop(void)
-{
-       if (wd_ops)
-               wd_ops->stop();
-}
-
-unsigned lapic_adjust_nmi_hz(unsigned hz)
-{
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-       if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
-           wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
-               hz = adjust_for_32bit_ctr(hz);
-       return hz;
-}
-
-int __kprobes lapic_wd_event(unsigned nmi_hz)
-{
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-       u64 ctr;
-
-       rdmsrl(wd->perfctr_msr, ctr);
-       if (ctr & wd_ops->checkbit) /* perfctr still running? */
-               return 0;
-
-       wd_ops->rearm(wd, nmi_hz);
-       return 1;
-}
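
For reference, the arithmetic in the removed write_watchdog_counter()/adjust_for_32bit_ctr() pair: the perfctr is loaded with -(cpu_khz * 1000 / nmi_hz) so that it overflows, and raises the watchdog NMI, nmi_hz times per second; on counters with only 31 writable bits, nmi_hz is raised until that period fits. A worked user-space sketch with made-up numbers:

/* Worked example of the removed counter arithmetic (numbers are made up):
 * a 3 GHz CPU asked for nmi_hz = 1 needs a period of 3e9 cycles, which
 * does not fit in 31 bits, so the rate is bumped to 2. */
#include <stdio.h>

int main(void)
{
        unsigned long long cpu_khz = 3000000;           /* 3 GHz */
        unsigned long long period = cpu_khz * 1000;     /* cycles per second */
        unsigned int nmi_hz = 1;

        if (period / nmi_hz > 0x7fffffffULL)
                nmi_hz = period / 0x7fffffffULL + 1;    /* adjust_for_32bit_ctr() */

        printf("nmi_hz=%u counter=-%llu\n", nmi_hz, period / nmi_hz);
        return 0;
}
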
index 6e8752c..8474c99 100644 (file)
@@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = {
 
 void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp, char *log_lvl)
+               unsigned long *stack, char *log_lvl)
 {
        printk("%sCall Trace:\n", log_lvl);
-       dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+       dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
 }
 
 void show_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp)
+               unsigned long *stack)
 {
-       show_trace_log_lvl(task, regs, stack, bp, "");
+       show_trace_log_lvl(task, regs, stack, "");
 }
 
 void show_stack(struct task_struct *task, unsigned long *sp)
 {
-       show_stack_log_lvl(task, NULL, sp, 0, "");
+       show_stack_log_lvl(task, NULL, sp, "");
 }
 
 /*
@@ -210,7 +210,7 @@ void dump_stack(void)
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
-       show_trace(NULL, NULL, &stack, bp);
+       show_trace(NULL, NULL, &stack);
 }
 EXPORT_SYMBOL(dump_stack);
 
index 1bc7f75..74cc1ed 100644 (file)
 #include <asm/stacktrace.h>
 
 
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp,
+void dump_trace(struct task_struct *task,
+               struct pt_regs *regs, unsigned long *stack,
                const struct stacktrace_ops *ops, void *data)
 {
        int graph = 0;
+       unsigned long bp;
 
        if (!task)
                task = current;
@@ -34,18 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
                        stack = (unsigned long *)task->thread.sp;
        }
 
-#ifdef CONFIG_FRAME_POINTER
-       if (!bp) {
-               if (task == current) {
-                       /* Grab bp right from our regs */
-                       get_bp(bp);
-               } else {
-                       /* bp is the last reg pushed by switch_to */
-                       bp = *(unsigned long *) task->thread.sp;
-               }
-       }
-#endif
-
+       bp = stack_frame(task, regs);
        for (;;) {
                struct thread_info *context;
 
@@ -65,7 +55,7 @@ EXPORT_SYMBOL(dump_trace);
 
 void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                  unsigned long *sp, unsigned long bp, char *log_lvl)
+                  unsigned long *sp, char *log_lvl)
 {
        unsigned long *stack;
        int i;
@@ -87,7 +77,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
                touch_nmi_watchdog();
        }
        printk(KERN_CONT "\n");
-       show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+       show_trace_log_lvl(task, regs, sp, log_lvl);
 }
 
 
@@ -112,8 +102,7 @@ void show_registers(struct pt_regs *regs)
                u8 *ip;
 
                printk(KERN_EMERG "Stack:\n");
-               show_stack_log_lvl(NULL, regs, &regs->sp,
-                               0, KERN_EMERG);
+               show_stack_log_lvl(NULL, regs, &regs->sp, KERN_EMERG);
 
                printk(KERN_EMERG "Code: ");
 
index 6a34048..6410133 100644 (file)
@@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
  * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
  */
 
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp,
+void dump_trace(struct task_struct *task,
+               struct pt_regs *regs, unsigned long *stack,
                const struct stacktrace_ops *ops, void *data)
 {
        const unsigned cpu = get_cpu();
@@ -149,6 +149,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
        unsigned used = 0;
        struct thread_info *tinfo;
        int graph = 0;
+       unsigned long bp;
 
        if (!task)
                task = current;
@@ -160,18 +161,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
                        stack = (unsigned long *)task->thread.sp;
        }
 
-#ifdef CONFIG_FRAME_POINTER
-       if (!bp) {
-               if (task == current) {
-                       /* Grab bp right from our regs */
-                       get_bp(bp);
-               } else {
-                       /* bp is the last reg pushed by switch_to */
-                       bp = *(unsigned long *) task->thread.sp;
-               }
-       }
-#endif
-
+       bp = stack_frame(task, regs);
        /*
         * Print function call entries in all stacks, starting at the
         * current stack address. If the stacks consist of nested
@@ -235,7 +225,7 @@ EXPORT_SYMBOL(dump_trace);
 
 void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                  unsigned long *sp, unsigned long bp, char *log_lvl)
+                  unsigned long *sp, char *log_lvl)
 {
        unsigned long *irq_stack_end;
        unsigned long *irq_stack;
@@ -279,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
        preempt_enable();
 
        printk(KERN_CONT "\n");
-       show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+       show_trace_log_lvl(task, regs, sp, log_lvl);
 }
 
 void show_registers(struct pt_regs *regs)
@@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs)
 
                printk(KERN_EMERG "Stack:\n");
                show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
-                               regs->bp, KERN_EMERG);
+                                  KERN_EMERG);
 
                printk(KERN_EMERG "Code: ");
 
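
Both the 32-bit and 64-bit dump_trace() now derive the frame pointer from stack_frame(task, regs) instead of taking a bp argument. Judging from the open-coded CONFIG_FRAME_POINTER block they replace, such a helper presumably looks roughly like the sketch below; this is a reconstruction, not the actual <asm/stacktrace.h> code.

/* Plausible shape of stack_frame(), reconstructed from the removed code;
 * not the actual kernel implementation. */
static inline unsigned long stack_frame(struct task_struct *task,
                                        struct pt_regs *regs)
{
#ifdef CONFIG_FRAME_POINTER
        unsigned long bp;

        if (regs)
                return regs->bp;
        if (!task || task == current) {
                get_bp(bp);             /* grab bp right from our regs */
                return bp;
        }
        /* bp is the last register pushed by switch_to() */
        return *(unsigned long *)task->thread.sp;
#else
        return 0;
#endif
}
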
index 1cbd54c..5940282 100644 (file)
@@ -1184,6 +1184,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
 {
        struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
+       /* This is possible if op is under delayed unoptimizing */
+       if (kprobe_disabled(&op->kp))
+               return;
+
        preempt_disable();
        if (kprobe_running()) {
                kprobes_inc_nmissed_count(&op->kp);
@@ -1401,10 +1405,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
        return 0;
 }
 
-/* Replace a breakpoint (int3) with a relative jump.  */
-int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
+#define MAX_OPTIMIZE_PROBES 256
+static struct text_poke_param *jump_poke_params;
+static struct jump_poke_buffer {
+       u8 buf[RELATIVEJUMP_SIZE];
+} *jump_poke_bufs;
+
+static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
+                                           u8 *insn_buf,
+                                           struct optimized_kprobe *op)
 {
-       unsigned char jmp_code[RELATIVEJUMP_SIZE];
        s32 rel = (s32)((long)op->optinsn.insn -
                        ((long)op->kp.addr + RELATIVEJUMP_SIZE));
 
@@ -1412,16 +1422,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
        memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
               RELATIVE_ADDR_SIZE);
 
-       jmp_code[0] = RELATIVEJUMP_OPCODE;
-       *(s32 *)(&jmp_code[1]) = rel;
+       insn_buf[0] = RELATIVEJUMP_OPCODE;
+       *(s32 *)(&insn_buf[1]) = rel;
+
+       tprm->addr = op->kp.addr;
+       tprm->opcode = insn_buf;
+       tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Replace breakpoints (int3) with relative jumps.
+ * Caller must hold kprobe_mutex and text_mutex.
+ */
+void __kprobes arch_optimize_kprobes(struct list_head *oplist)
+{
+       struct optimized_kprobe *op, *tmp;
+       int c = 0;
+
+       list_for_each_entry_safe(op, tmp, oplist, list) {
+               WARN_ON(kprobe_disabled(&op->kp));
+               /* Setup param */
+               setup_optimize_kprobe(&jump_poke_params[c],
+                                     jump_poke_bufs[c].buf, op);
+               list_del_init(&op->list);
+               if (++c >= MAX_OPTIMIZE_PROBES)
+                       break;
+       }
 
        /*
         * text_poke_smp doesn't support NMI/MCE code modifying.
         * However, since kprobes itself also doesn't support NMI/MCE
         * code probing, it's not a problem.
         */
-       text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
-       return 0;
+       text_poke_smp_batch(jump_poke_params, c);
+}
+
+static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
+                                             u8 *insn_buf,
+                                             struct optimized_kprobe *op)
+{
+       /* Set int3 to first byte for kprobes */
+       insn_buf[0] = BREAKPOINT_INSTRUCTION;
+       memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+
+       tprm->addr = op->kp.addr;
+       tprm->opcode = insn_buf;
+       tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * Caller must hold kprobe_mutex.
+ */
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+                                   struct list_head *done_list)
+{
+       struct optimized_kprobe *op, *tmp;
+       int c = 0;
+
+       list_for_each_entry_safe(op, tmp, oplist, list) {
+               /* Setup param */
+               setup_unoptimize_kprobe(&jump_poke_params[c],
+                                       jump_poke_bufs[c].buf, op);
+               list_move(&op->list, done_list);
+               if (++c >= MAX_OPTIMIZE_PROBES)
+                       break;
+       }
+
+       /*
+        * text_poke_smp doesn't support NMI/MCE code modifying.
+        * However, since kprobes itself also doesn't support NMI/MCE
+        * code probing, it's not a problem.
+        */
+       text_poke_smp_batch(jump_poke_params, c);
 }
 
 /* Replace a relative jump with a breakpoint (int3).  */
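
setup_optimize_kprobe() above assembles a 5-byte near jump: the RELATIVEJUMP_OPCODE byte (0xe9) followed by a 32-bit displacement measured from the end of the instruction, i.e. target - (addr + RELATIVEJUMP_SIZE). A standalone sketch of that encoding; encode_reljump() and the addresses are illustrative.

/* Sketch of the jmp rel32 encoding built above: one opcode byte plus a
 * 4-byte displacement relative to the end of the 5-byte instruction. */
#include <stdint.h>
#include <string.h>

static void encode_reljump(uint8_t buf[5], unsigned long addr,
                           unsigned long target)
{
        int32_t rel = (int32_t)(target - (addr + 5));

        buf[0] = 0xe9;                  /* RELATIVEJUMP_OPCODE */
        memcpy(&buf[1], &rel, sizeof(rel));
}
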
@@ -1453,11 +1526,35 @@ static int  __kprobes setup_detour_execution(struct kprobe *p,
        }
        return 0;
 }
+
+static int __kprobes init_poke_params(void)
+{
+       /* Allocate code buffer and parameter array */
+       jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
+                                MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+       if (!jump_poke_bufs)
+               return -ENOMEM;
+
+       jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
+                                  MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+       if (!jump_poke_params) {
+               kfree(jump_poke_bufs);
+               jump_poke_bufs = NULL;
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+#else  /* !CONFIG_OPTPROBES */
+static int __kprobes init_poke_params(void)
+{
+       return 0;
+}
 #endif
 
 int __init arch_init_kprobes(void)
 {
-       return 0;
+       return init_poke_params();
 }
 
 int __kprobes arch_trampoline_kprobe(struct kprobe *p)
index 57d1868..c852041 100644 (file)
@@ -91,8 +91,7 @@ void exit_thread(void)
 void show_regs(struct pt_regs *regs)
 {
        show_registers(regs);
-       show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs),
-                  regs->bp);
+       show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs));
 }
 
 void show_regs_common(void)
@@ -374,6 +373,7 @@ void default_idle(void)
 {
        if (hlt_use_halt()) {
                trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+               trace_cpu_idle(1, smp_processor_id());
                current_thread_info()->status &= ~TS_POLLING;
                /*
                 * TS_POLLING-cleared state must be visible before we
@@ -444,6 +444,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
        trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
+       trace_cpu_idle((ax>>4)+1, smp_processor_id());
        if (!need_resched()) {
                if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
                        clflush((void *)&current_thread_info()->flags);
@@ -460,6 +461,7 @@ static void mwait_idle(void)
 {
        if (!need_resched()) {
                trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+               trace_cpu_idle(1, smp_processor_id());
                if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
                        clflush((void *)&current_thread_info()->flags);
 
@@ -481,10 +483,12 @@ static void mwait_idle(void)
 static void poll_idle(void)
 {
        trace_power_start(POWER_CSTATE, 0, smp_processor_id());
+       trace_cpu_idle(0, smp_processor_id());
        local_irq_enable();
        while (!need_resched())
                cpu_relax();
-       trace_power_end(0);
+       trace_power_end(smp_processor_id());
+       trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
 }
 
 /*
index 96586c3..4b9befa 100644 (file)
@@ -113,8 +113,8 @@ void cpu_idle(void)
                        stop_critical_timings();
                        pm_idle();
                        start_critical_timings();
-
                        trace_power_end(smp_processor_id());
+                       trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
                }
                tick_nohz_restart_sched_tick();
                preempt_enable_no_resched();
index b3d7a3a..4c818a7 100644 (file)
@@ -142,6 +142,8 @@ void cpu_idle(void)
                        start_critical_timings();
 
                        trace_power_end(smp_processor_id());
+                       trace_cpu_idle(PWR_EVENT_EXIT,
+                                      smp_processor_id());
 
                        /* In many cases the interrupt that ended idle
                           has already called exit_idle. But some idle
index 083e99d..68f61ac 100644 (file)
@@ -281,6 +281,13 @@ static void __cpuinit smp_callin(void)
         */
        smp_store_cpu_info(cpuid);
 
+       /*
+        * This must be done before setting cpu_online_mask
+        * or calling notify_cpu_starting.
+        */
+       set_cpu_sibling_map(raw_smp_processor_id());
+       wmb();
+
        notify_cpu_starting(cpuid);
 
        /*
@@ -316,16 +323,6 @@ notrace static void __cpuinit start_secondary(void *unused)
         */
        check_tsc_sync_target();
 
-       if (nmi_watchdog == NMI_IO_APIC) {
-               legacy_pic->mask(0);
-               enable_NMI_through_LVT0();
-               legacy_pic->unmask(0);
-       }
-
-       /* This must be done before setting cpu_online_mask */
-       set_cpu_sibling_map(raw_smp_processor_id());
-       wmb();
-
        /*
         * We need to hold call_lock, so there is no inconsistency
         * between the time smp_call_function() determines number of
@@ -1061,8 +1058,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
                printk(KERN_INFO "SMP mode deactivated.\n");
                smpboot_clear_io_apic();
 
-               localise_nmi_watchdog();
-
                connect_bsp_APIC();
                setup_local_APIC();
                end_local_APIC_setup();
@@ -1196,7 +1191,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 #ifdef CONFIG_X86_IO_APIC
        setup_ioapic_dest();
 #endif
-       check_nmi_watchdog();
        mtrr_aps_init();
 }
 
@@ -1341,8 +1335,6 @@ int native_cpu_disable(void)
        if (cpu == 0)
                return -EBUSY;
 
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               stop_apic_nmi_watchdog(NULL);
        clear_local_APIC();
 
        cpu_disable_common();
index b53c525..938c8e1 100644 (file)
@@ -73,22 +73,22 @@ static const struct stacktrace_ops save_stack_ops_nosched = {
  */
 void save_stack_trace(struct stack_trace *trace)
 {
-       dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
+       dump_trace(current, NULL, NULL, &save_stack_ops, trace);
        if (trace->nr_entries < trace->max_entries)
                trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 EXPORT_SYMBOL_GPL(save_stack_trace);
 
-void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
+void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
 {
-       dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
+       dump_trace(current, regs, NULL, &save_stack_ops, trace);
        if (trace->nr_entries < trace->max_entries)
                trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 
 void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 {
-       dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
+       dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace);
        if (trace->nr_entries < trace->max_entries)
                trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
index fb5cc5e..25a28a2 100644 (file)
 #include <asm/hpet.h>
 #include <asm/time.h>
 
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-int timer_ack;
-#endif
-
 #ifdef CONFIG_X86_64
 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
 #endif
@@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
        /* Keep nmi watchdog up to date */
        inc_irq_stat(irq0_irqs);
 
-       /* Optimized out for !IO_APIC and x86_64 */
-       if (timer_ack) {
-               /*
-                * Subtle, when I/O APICs are used we have to ack timer IRQ
-                * manually to deassert NMI lines for the watchdog if run
-                * on an 82489DX-based system.
-                */
-               raw_spin_lock(&i8259A_lock);
-               outb(0x0c, PIC_MASTER_OCW3);
-               /* Ack the IRQ; AEOI will end it automatically. */
-               inb(PIC_MASTER_POLL);
-               raw_spin_unlock(&i8259A_lock);
-       }
-
        global_clock_event->event_handler(global_clock_event);
 
        /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
index cb838ca..c76aaca 100644 (file)
@@ -83,6 +83,8 @@ EXPORT_SYMBOL_GPL(used_vectors);
 
 static int ignore_nmis;
 
+int unknown_nmi_panic;
+
 static inline void conditional_sti(struct pt_regs *regs)
 {
        if (regs->flags & X86_EFLAGS_IF)
@@ -300,6 +302,13 @@ gp_in_kernel:
        die("general protection fault", regs, error_code);
 }
 
+static int __init setup_unknown_nmi_panic(char *str)
+{
+       unknown_nmi_panic = 1;
+       return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
 static notrace __kprobes void
 mem_parity_error(unsigned char reason, struct pt_regs *regs)
 {
@@ -342,9 +351,11 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
        reason = (reason & 0xf) | 8;
        outb(reason, 0x61);
 
-       i = 2000;
-       while (--i)
-               udelay(1000);
+       i = 20000;
+       while (--i) {
+               touch_nmi_watchdog();
+               udelay(100);
+       }
 
        reason &= ~8;
        outb(reason, 0x61);
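
The new loop waits just as long as the old one: 2000 iterations x 1000 us and 20000 iterations x 100 us both come to roughly 2 seconds. The only functional change is that touch_nmi_watchdog() is now called every 100 us, evidently so the NMI watchdog cannot expire while io_check_error() sits in this delay.
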
@@ -371,7 +382,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
                        reason, smp_processor_id());
 
        printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
-       if (panic_on_unrecovered_nmi)
+       if (unknown_nmi_panic || panic_on_unrecovered_nmi)
                panic("NMI: Not continuing");
 
        printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
@@ -397,20 +408,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
                if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
                                                        == NOTIFY_STOP)
                        return;
-
-#ifndef CONFIG_LOCKUP_DETECTOR
-               /*
-                * Ok, so this is none of the documented NMI sources,
-                * so it must be the NMI watchdog.
-                */
-               if (nmi_watchdog_tick(regs, reason))
-                       return;
-               if (!do_nmi_callback(regs, cpu))
-#endif /* !CONFIG_LOCKUP_DETECTOR */
-                       unknown_nmi_error(reason, regs);
-#else
-               unknown_nmi_error(reason, regs);
 #endif
+               unknown_nmi_error(reason, regs);
 
                return;
        }
@@ -446,14 +445,12 @@ do_nmi(struct pt_regs *regs, long error_code)
 
 void stop_nmi(void)
 {
-       acpi_nmi_disable();
        ignore_nmis++;
 }
 
 void restart_nmi(void)
 {
        ignore_nmis--;
-       acpi_nmi_enable();
 }
 
 /* May run on IST stack. */
index af3b6c8..704a37c 100644 (file)
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
        e->trace.entries = e->trace_entries;
        e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
        e->trace.skip = 0;
-       save_stack_trace_bp(&e->trace, regs->bp);
+       save_stack_trace_regs(&e->trace, regs);
 
        /* Round address down to nearest 16 bytes */
        shadow_copy = kmemcheck_shadow_lookup(address
index 2d49d4e..72cbec1 100644 (file)
@@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
        if (!user_mode_vm(regs)) {
                unsigned long stack = kernel_stack_pointer(regs);
                if (depth)
-                       dump_trace(NULL, regs, (unsigned long *)stack, 0,
+                       dump_trace(NULL, regs, (unsigned long *)stack,
                                   &backtrace_ops, &depth);
                return;
        }
index 4e8baad..358c8b9 100644 (file)
@@ -732,6 +732,9 @@ int __init op_nmi_init(struct oprofile_operations *ops)
                case 0x14:
                        cpu_type = "x86-64/family14h";
                        break;
+               case 0x15:
+                       cpu_type = "x86-64/family15h";
+                       break;
                default:
                        return -ENODEV;
                }
index e3ecb71..0636dd9 100644 (file)
@@ -58,9 +58,6 @@ static void timer_stop(void)
 
 int __init op_nmi_timer_init(struct oprofile_operations *ops)
 {
-       if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0))
-               return -ENODEV;
-
        ops->start = timer_start;
        ops->stop = timer_stop;
        ops->cpu_type = "timer";
index 7d90d47..51104b3 100644 (file)
 #include "op_x86_model.h"
 #include "op_counter.h"
 
-#define NUM_COUNTERS 4
+#define NUM_COUNTERS           4
+#define NUM_COUNTERS_F15H      6
 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-#define NUM_VIRT_COUNTERS 32
+#define NUM_VIRT_COUNTERS      32
 #else
-#define NUM_VIRT_COUNTERS NUM_COUNTERS
+#define NUM_VIRT_COUNTERS      0
 #endif
 
 #define OP_EVENT_MASK                  0x0FFF
@@ -41,7 +42,8 @@
 
 #define MSR_AMD_EVENTSEL_RESERVED      ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
 
-static unsigned long reset_value[NUM_VIRT_COUNTERS];
+static int num_counters;
+static unsigned long reset_value[OP_MAX_COUNTER];
 
 #define IBS_FETCH_SIZE                 6
 #define IBS_OP_SIZE                    12
@@ -387,7 +389,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
        int i;
 
        /* enable active counters */
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                int virt = op_x86_phys_to_virt(i);
                if (!reset_value[virt])
                        continue;
@@ -406,7 +408,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
 {
        int i;
 
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                if (!msrs->counters[i].addr)
                        continue;
                release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
@@ -418,7 +420,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
 {
        int i;
 
-       for (i = 0; i < NUM_COUNTERS; i++) {
+       for (i = 0; i < num_counters; i++) {
                if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
                        goto fail;
                if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
@@ -426,8 +428,13 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
                        goto fail;
                }
                /* both registers must be reserved */
-               msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
-               msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+               if (num_counters == NUM_COUNTERS_F15H) {
+                       msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
+                       msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
+               } else {
+                       msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+                       msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+               }
                continue;
        fail:
                if (!counter_config[i].enabled)
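
The (i << 1) stride above reflects how family 15h lays out its performance MSRs: the control and counter registers are assumed to come in interleaved pairs, so physical counter i uses MSR_F15H_PERF_CTL + 2*i and MSR_F15H_PERF_CTR + 2*i, whereas older parts keep the flat MSR_K7_EVNTSEL0 + i / MSR_K7_PERFCTR0 + i scheme preserved by the else branch.
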
@@ -447,7 +454,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
        int i;
 
        /* setup reset_value */
-       for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
+       for (i = 0; i < OP_MAX_COUNTER; ++i) {
                if (counter_config[i].enabled
                    && msrs->counters[op_x86_virt_to_phys(i)].addr)
                        reset_value[i] = counter_config[i].count;
@@ -456,7 +463,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
        }
 
        /* clear all counters */
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                if (!msrs->controls[i].addr)
                        continue;
                rdmsrl(msrs->controls[i].addr, val);
@@ -472,7 +479,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
        }
 
        /* enable active counters */
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                int virt = op_x86_phys_to_virt(i);
                if (!reset_value[virt])
                        continue;
@@ -503,7 +510,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs,
        u64 val;
        int i;
 
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                int virt = op_x86_phys_to_virt(i);
                if (!reset_value[virt])
                        continue;
@@ -526,7 +533,7 @@ static void op_amd_start(struct op_msrs const * const msrs)
        u64 val;
        int i;
 
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                if (!reset_value[op_x86_phys_to_virt(i)])
                        continue;
                rdmsrl(msrs->controls[i].addr, val);
@@ -546,7 +553,7 @@ static void op_amd_stop(struct op_msrs const * const msrs)
         * Subtle: stop on all counters to avoid race with setting our
         * pm callback
         */
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                if (!reset_value[op_x86_phys_to_virt(i)])
                        continue;
                rdmsrl(msrs->controls[i].addr, val);
@@ -706,18 +713,29 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
        return 0;
 }
 
+struct op_x86_model_spec op_amd_spec;
+
 static int op_amd_init(struct oprofile_operations *ops)
 {
        init_ibs();
        create_arch_files = ops->create_files;
        ops->create_files = setup_ibs_files;
+
+       if (boot_cpu_data.x86 == 0x15) {
+               num_counters = NUM_COUNTERS_F15H;
+       } else {
+               num_counters = NUM_COUNTERS;
+       }
+
+       op_amd_spec.num_counters = num_counters;
+       op_amd_spec.num_controls = num_counters;
+       op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS);
+
        return 0;
 }
 
 struct op_x86_model_spec op_amd_spec = {
-       .num_counters           = NUM_COUNTERS,
-       .num_controls           = NUM_COUNTERS,
-       .num_virt_counters      = NUM_VIRT_COUNTERS,
+       /* num_counters/num_controls filled in at runtime */
        .reserved               = MSR_AMD_EVENTSEL_RESERVED,
        .event_mask             = OP_EVENT_MASK,
        .init                   = op_amd_init,
index 182558d..9fadec0 100644 (file)
@@ -11,7 +11,7 @@
 #include <linux/oprofile.h>
 #include <linux/smp.h>
 #include <linux/ptrace.h>
-#include <linux/nmi.h>
+#include <asm/nmi.h>
 #include <asm/msr.h>
 #include <asm/fixmap.h>
 #include <asm/apic.h>
index 660a272..0cac7ec 100644 (file)
@@ -577,9 +577,7 @@ acpi_ns_init_one_device(acpi_handle obj_handle,
         * as possible (without an NMI being received in the middle of
         * this) - so disable NMIs and initialize the device:
         */
-       acpi_nmi_disable();
        status = acpi_ns_evaluate(info);
-       acpi_nmi_enable();
 
        if (ACPI_SUCCESS(status)) {
                walk_info->num_INI++;
index c63a438..1109f68 100644 (file)
@@ -355,6 +355,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
                dprintk("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new,
                        (unsigned long)freqs->cpu);
                trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu);
+               trace_cpu_frequency(freqs->new, freqs->cpu);
                srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
                                CPUFREQ_POSTCHANGE, freqs);
                if (likely(policy) && likely(policy->cpu == freqs->cpu))
index a507108..08d5f05 100644 (file)
@@ -107,6 +107,7 @@ static void cpuidle_idle_call(void)
        if (cpuidle_curr_governor->reflect)
                cpuidle_curr_governor->reflect(dev);
        trace_power_end(smp_processor_id());
+       trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
 }
 
 /**
index c131d58..56ac09d 100644 (file)
@@ -220,9 +220,8 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
        kt_before = ktime_get_real();
 
        stop_critical_timings();
-#ifndef MODULE
        trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
-#endif
+       trace_cpu_idle((eax >> 4) + 1, cpu);
        if (!need_resched()) {
 
                __monitor((void *)&current_thread_info()->flags, 0, 0);
index 3d77116..dea7b5b 100644 (file)
@@ -642,19 +642,14 @@ static struct notifier_block die_notifier = {
  */
 
 #ifdef CONFIG_HPWDT_NMI_DECODING
-#ifdef ARCH_HAS_NMI_WATCHDOG
+#ifdef CONFIG_X86_LOCAL_APIC
 static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev)
 {
        /*
         * If nmi_watchdog is turned off then we can turn on
         * our nmi decoding capability.
         */
-       if (!nmi_watchdog_active())
-               hpwdt_nmi_decoding = 1;
-       else
-               dev_warn(&dev->dev, "NMI decoding is disabled. To enable this "
-                       "functionality you must reboot with nmi_watchdog=0 "
-                       "and load the hpwdt driver with priority=1.\n");
+       hpwdt_nmi_decoding = 1;
 }
 #else
 static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev)
@@ -662,7 +657,7 @@ static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev)
        dev_warn(&dev->dev, "NMI decoding is disabled. "
                "Your kernel does not support a NMI Watchdog.\n");
 }
-#endif /* ARCH_HAS_NMI_WATCHDOG */
+#endif /* CONFIG_X86_LOCAL_APIC */
 
 static int __devinit hpwdt_init_nmi_decoding(struct pci_dev *dev)
 {
index 5476c06..3c4039d 100644 (file)
@@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
        int metadata;
        unsigned int revokes = 0;
        int x;
-       int error;
+       int error = 0;
 
        if (!*top)
                sm->sm_first = 0;
@@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
        if (metadata)
                revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
 
-       error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+       if (ip != GFS2_I(sdp->sd_rindex))
+               error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+       else if (!sdp->sd_rgrps)
+               error = gfs2_ri_update(ip);
+
        if (error)
                return error;
 
@@ -879,7 +883,8 @@ out_rg_gunlock:
 out_rlist:
        gfs2_rlist_free(&rlist);
 out:
-       gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
+       if (ip != GFS2_I(sdp->sd_rindex))
+               gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
        return error;
 }
 
index f92c177..08a8beb 100644 (file)
@@ -541,21 +541,6 @@ out_locked:
        spin_unlock(&gl->gl_spin);
 }
 
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-                                unsigned int req_state,
-                                unsigned int flags)
-{
-       int ret = LM_OUT_ERROR;
-
-       if (!sdp->sd_lockstruct.ls_ops->lm_lock)
-               return req_state == LM_ST_UNLOCKED ? 0 : req_state;
-
-       if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-               ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
-                                                        req_state, flags);
-       return ret;
-}
-
 /**
  * do_xmote - Calls the DLM to change the state of a lock
  * @gl: The lock state
@@ -575,13 +560,14 @@ __acquires(&gl->gl_spin)
 
        lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
                      LM_FLAG_PRIORITY);
-       BUG_ON(gl->gl_state == target);
-       BUG_ON(gl->gl_state == gl->gl_target);
+       GLOCK_BUG_ON(gl, gl->gl_state == target);
+       GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
        if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
            glops->go_inval) {
                set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
                do_error(gl, 0); /* Fail queued try locks */
        }
+       gl->gl_req = target;
        spin_unlock(&gl->gl_spin);
        if (glops->go_xmote_th)
                glops->go_xmote_th(gl);
@@ -594,15 +580,17 @@ __acquires(&gl->gl_spin)
            gl->gl_state == LM_ST_DEFERRED) &&
            !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
                lck_flags |= LM_FLAG_TRY_1CB;
-       ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
 
-       if (!(ret & LM_OUT_ASYNC)) {
-               finish_xmote(gl, ret);
+       if (sdp->sd_lockstruct.ls_ops->lm_lock) {
+               /* lock_dlm */
+               ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
+               GLOCK_BUG_ON(gl, ret);
+       } else { /* lock_nolock */
+               finish_xmote(gl, target);
                if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                        gfs2_glock_put(gl);
-       } else {
-               GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
        }
+
        spin_lock(&gl->gl_spin);
 }
 
@@ -951,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
 
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
 {
+       struct va_format vaf;
        va_list args;
 
        va_start(args, fmt);
+
        if (seq) {
                struct gfs2_glock_iter *gi = seq->private;
                vsprintf(gi->string, fmt, args);
                seq_printf(seq, gi->string);
        } else {
-               printk(KERN_ERR " ");
-               vprintk(fmt, args);
+               vaf.fmt = fmt;
+               vaf.va = &args;
+
+               printk(KERN_ERR " %pV", &vaf);
        }
+
        va_end(args);
 }
 
@@ -1361,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
  * @gl: Pointer to the glock
  * @ret: The return value from the dlm
  *
+ * The gl_reply field is protected by the gl_spin lock, so it is safe
+ * for it to share a bitfield with the other glock state fields.
  */
 
 void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 {
        struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
 
+       spin_lock(&gl->gl_spin);
        gl->gl_reply = ret;
 
        if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
-               spin_lock(&gl->gl_spin);
                if (gfs2_should_freeze(gl)) {
                        set_bit(GLF_FROZEN, &gl->gl_flags);
                        spin_unlock(&gl->gl_spin);
                        return;
                }
-               spin_unlock(&gl->gl_spin);
        }
+
+       spin_unlock(&gl->gl_spin);
        set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+       smp_wmb();
        gfs2_glock_hold(gl);
        if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                gfs2_glock_put(gl);
@@ -1626,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
 static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
 {
        struct task_struct *gh_owner = NULL;
-       char buffer[KSYM_SYMBOL_LEN];
        char flags_buf[32];
 
-       sprint_symbol(buffer, gh->gh_ip);
        if (gh->gh_owner_pid)
                gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
-       gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
-                 state2str(gh->gh_state),
-                 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
-                 gh->gh_error, 
-                 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
-                 gh_owner ? gh_owner->comm : "(ended)", buffer);
+       gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
+                      state2str(gh->gh_state),
+                      hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+                      gh->gh_error,
+                      gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+                      gh_owner ? gh_owner->comm : "(ended)",
+                      (void *)gh->gh_ip);
        return 0;
 }
 
@@ -1782,12 +1778,13 @@ int __init gfs2_glock_init(void)
        }
 #endif
 
-       glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+       glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
                                          WQ_HIGHPRI | WQ_FREEZEABLE, 0);
        if (IS_ERR(glock_workqueue))
                return PTR_ERR(glock_workqueue);
-       gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
-                                               WQ_FREEZEABLE, 0);
+       gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
+                                               WQ_MEM_RECLAIM | WQ_FREEZEABLE,
+                                               0);
        if (IS_ERR(gfs2_delete_workqueue)) {
                destroy_workqueue(glock_workqueue);
                return PTR_ERR(gfs2_delete_workqueue);
index db1c26d..691851c 100644 (file)
@@ -87,11 +87,10 @@ enum {
 #define GL_ASYNC               0x00000040
 #define GL_EXACT               0x00000080
 #define GL_SKIP                        0x00000100
-#define GL_ATIME               0x00000200
 #define GL_NOCACHE             0x00000400
   
 /*
- * lm_lock() and lm_async_cb return flags
+ * lm_async_cb return flags
  *
  * LM_OUT_ST_MASK
  * Masks the lower two bits of lock state in the returned value.
@@ -99,15 +98,11 @@ enum {
  * LM_OUT_CANCELED
  * The lock request was canceled.
  *
- * LM_OUT_ASYNC
- * The result of the request will be returned in an LM_CB_ASYNC callback.
- *
  */
 
 #define LM_OUT_ST_MASK         0x00000003
 #define LM_OUT_CANCELED                0x00000008
-#define LM_OUT_ASYNC           0x00000080
-#define LM_OUT_ERROR           0x00000100
+#define LM_OUT_ERROR           0x00000004
 
 /*
  * lm_recovery_done() messages
@@ -124,25 +119,12 @@ struct lm_lockops {
        void (*lm_unmount) (struct gfs2_sbd *sdp);
        void (*lm_withdraw) (struct gfs2_sbd *sdp);
        void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
-       unsigned int (*lm_lock) (struct gfs2_glock *gl,
-                                unsigned int req_state, unsigned int flags);
+       int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
+                       unsigned int flags);
        void (*lm_cancel) (struct gfs2_glock *gl);
        const match_table_t *lm_tokens;
 };
 
-#define LM_FLAG_TRY            0x00000001
-#define LM_FLAG_TRY_1CB                0x00000002
-#define LM_FLAG_NOEXP          0x00000004
-#define LM_FLAG_ANY            0x00000008
-#define LM_FLAG_PRIORITY       0x00000010
-
-#define GL_ASYNC               0x00000040
-#define GL_EXACT               0x00000080
-#define GL_SKIP                        0x00000100
-#define GL_NOCACHE             0x00000400
-
-#define GLR_TRYFAILED          13
-
 extern struct workqueue_struct *gfs2_delete_workqueue;
 static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
 {
@@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
 int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+
+__attribute__ ((format(printf, 2, 3)))
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 
 /**
index 0d149dc..263561b 100644 (file)
@@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
 
        if (gl->gl_state != LM_ST_UNLOCKED &&
            test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-               flush_workqueue(gfs2_delete_workqueue);
                gfs2_meta_syncfs(sdp);
                gfs2_log_shutdown(sdp);
        }
index 764fbb4..8d3d2b4 100644 (file)
@@ -207,12 +207,14 @@ struct gfs2_glock {
 
        spinlock_t gl_spin;
 
-       unsigned int gl_state;
-       unsigned int gl_target;
-       unsigned int gl_reply;
+       /* State fields protected by gl_spin */
+       unsigned int gl_state:2,        /* Current state */
+                    gl_target:2,       /* Target state */
+                    gl_demote_state:2, /* State requested by remote node */
+                    gl_req:2,          /* State in last dlm request */
+                    gl_reply:8;        /* Last reply from the dlm */
+
        unsigned int gl_hash;
-       unsigned int gl_req;
-       unsigned int gl_demote_state; /* state requested by remote node */
        unsigned long gl_demote_time; /* time of first demote request */
        struct list_head gl_holders;
 
index e1213f7..14e682d 100644 (file)
@@ -916,17 +916,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
        if (error)
                return error;
 
-       if ((attr->ia_valid & ATTR_SIZE) &&
-           attr->ia_size != i_size_read(inode)) {
-               error = vmtruncate(inode, attr->ia_size);
-               if (error)
-                       return error;
-       }
-
        setattr_copy(inode, attr);
        mark_inode_dirty(inode);
-
-       gfs2_assert_warn(GFS2_SB(inode), !error);
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
        gfs2_dinode_out(ip, dibh->b_data);
        brelse(dibh);
index 1c09425..6e493ae 100644 (file)
@@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
        return lkf;
 }
 
-static unsigned int gdlm_lock(struct gfs2_glock *gl,
-                             unsigned int req_state, unsigned int flags)
+static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
+                    unsigned int flags)
 {
        struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
-       int error;
        int req;
        u32 lkf;
 
-       gl->gl_req = req_state;
        req = make_mode(req_state);
        lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
 
@@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
         * Submit the actual lock request.
         */
 
-       error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
-                        GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
-       if (error == -EAGAIN)
-               return 0;
-       if (error)
-               return LM_OUT_ERROR;
-       return LM_OUT_ASYNC;
+       return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
+                       GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
 }
 
 static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
index 12cbea7..1db6b73 100644 (file)
@@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
-       struct buffer_head *dibh;
        u32 ouid, ogid, nuid, ngid;
        int error;
 
@@ -1100,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
        if (error)
                goto out_gunlock_q;
 
-       error = gfs2_meta_inode_buffer(ip, &dibh);
+       error = gfs2_setattr_simple(ip, attr);
        if (error)
                goto out_end_trans;
 
-       if ((attr->ia_valid & ATTR_SIZE) &&
-           attr->ia_size != i_size_read(inode)) {
-               int error;
-
-               error = vmtruncate(inode, attr->ia_size);
-               gfs2_assert_warn(sdp, !error);
-       }
-
-       setattr_copy(inode, attr);
-       mark_inode_dirty(inode);
-
-       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-       gfs2_dinode_out(ip, dibh->b_data);
-       brelse(dibh);
-
        if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
                u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
                gfs2_quota_change(ip, -blocks, ouid, ogid);
index f606baf..a689901 100644 (file)
@@ -666,6 +666,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
                        qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
                        qd->qd_qb.qb_limit = qp->qu_limit;
                }
+               if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
+                       qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+                       qd->qd_qb.qb_value = qp->qu_value;
+               }
        }
 
        /* Write the quota into the quota file on disk */
@@ -1509,7 +1513,7 @@ out:
 }
 
 /* GFS2 only supports a subset of the XFS fields */
-#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
 
 static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
                          struct fs_disk_quota *fdq)
@@ -1569,9 +1573,15 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
        if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
            ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
                fdq->d_fieldmask ^= FS_DQ_BSOFT;
+
        if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
            ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
                fdq->d_fieldmask ^= FS_DQ_BHARD;
+
+       if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
+           ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
+               fdq->d_fieldmask ^= FS_DQ_BCOUNT;
+
        if (fdq->d_fieldmask == 0)
                goto out_i;
 
@@ -1620,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
        .get_dqblk      = gfs2_get_dqblk,
        .set_dqblk      = gfs2_set_dqblk,
 };
-
index 33c8407..7293ea2 100644 (file)
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
        for (rgrps = 0;; rgrps++) {
                loff_t pos = rgrps * sizeof(struct gfs2_rindex);
 
-               if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
+               if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
                        break;
                error = gfs2_internal_read(ip, &ra_state, buf, &pos,
                                           sizeof(struct gfs2_rindex));
@@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
  * Returns: 0 on successful update, error code otherwise
  */
 
-static int gfs2_ri_update(struct gfs2_inode *ip)
+int gfs2_ri_update(struct gfs2_inode *ip)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct inode *inode = &ip->i_inode;
@@ -614,46 +614,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 }
 
 /**
- * gfs2_ri_update_special - Pull in a new resource index from the disk
- *
- * This is a special version that's safe to call from gfs2_inplace_reserve_i.
- * In this case we know that we don't have any resource groups in memory yet.
- *
- * @ip: pointer to the rindex inode
- *
- * Returns: 0 on successful update, error code otherwise
- */
-static int gfs2_ri_update_special(struct gfs2_inode *ip)
-{
-       struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-       struct inode *inode = &ip->i_inode;
-       struct file_ra_state ra_state;
-       struct gfs2_rgrpd *rgd;
-       unsigned int max_data = 0;
-       int error;
-
-       file_ra_state_init(&ra_state, inode->i_mapping);
-       for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
-               /* Ignore partials */
-               if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-                   i_size_read(inode))
-                       break;
-               error = read_rindex_entry(ip, &ra_state);
-               if (error) {
-                       clear_rgrpdi(sdp);
-                       return error;
-               }
-       }
-       list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
-               if (rgd->rd_data > max_data)
-                       max_data = rgd->rd_data;
-       sdp->sd_max_rg_data = max_data;
-
-       sdp->sd_rindex_uptodate = 1;
-       return 0;
-}
-
-/**
  * gfs2_rindex_hold - Grab a lock on the rindex
  * @sdp: The GFS2 superblock
  * @ri_gh: the glock holder
@@ -1226,16 +1186,25 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
                        error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
                else if (!sdp->sd_rgrps) /* We may not have the rindex read
                                            in, so: */
-                       error = gfs2_ri_update_special(ip);
+                       error = gfs2_ri_update(ip);
                if (error)
                        return error;
        }
 
+try_again:
        do {
                error = get_local_rgrp(ip, &last_unlinked);
                /* If there is no space, flushing the log may release some */
-               if (error)
+               if (error) {
+                       if (ip == GFS2_I(sdp->sd_rindex) &&
+                           !sdp->sd_rindex_uptodate) {
+                               error = gfs2_ri_update(ip);
+                               if (error)
+                                       return error;
+                               goto try_again;
+                       }
                        gfs2_log_flush(sdp, NULL);
+               }
        } while (error && tries++ < 3);
 
        if (error) {
index 0e35c04..50c2bb0 100644 (file)
@@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
 
 extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
+extern int gfs2_ri_update(struct gfs2_inode *ip);
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
 extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
 
index 30b58f0..439b61c 100644 (file)
@@ -1296,10 +1296,8 @@ fail:
 
 int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 {
-       struct inode *inode = &ip->i_inode;
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_ea_location el;
-       struct buffer_head *dibh;
        int error;
 
        error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
        if (error)
                return error;
 
-       error = gfs2_meta_inode_buffer(ip, &dibh);
-       if (error)
-               goto out_trans_end;
-
-       if ((attr->ia_valid & ATTR_SIZE) &&
-           attr->ia_size != i_size_read(inode)) {
-               int error;
-
-               error = vmtruncate(inode, attr->ia_size);
-               gfs2_assert_warn(GFS2_SB(inode), !error);
-       }
-
-       setattr_copy(inode, attr);
-       mark_inode_dirty(inode);
-
-       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-       gfs2_dinode_out(ip, dibh->b_data);
-       brelse(dibh);
-
-out_trans_end:
+       error = gfs2_setattr_simple(ip, attr);
        gfs2_trans_end(sdp);
        return error;
 }
index 8beabb9..47e3997 100644 (file)
@@ -154,12 +154,14 @@ enum {
        TRACE_EVENT_FL_ENABLED_BIT,
        TRACE_EVENT_FL_FILTERED_BIT,
        TRACE_EVENT_FL_RECORDED_CMD_BIT,
+       TRACE_EVENT_FL_CAP_ANY_BIT,
 };
 
 enum {
        TRACE_EVENT_FL_ENABLED          = (1 << TRACE_EVENT_FL_ENABLED_BIT),
        TRACE_EVENT_FL_FILTERED         = (1 << TRACE_EVENT_FL_FILTERED_BIT),
        TRACE_EVENT_FL_RECORDED_CMD     = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT),
+       TRACE_EVENT_FL_CAP_ANY          = (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
 };
 
 struct ftrace_event_call {
@@ -196,6 +198,14 @@ struct ftrace_event_call {
 #endif
 };
 
+#define __TRACE_EVENT_FLAGS(name, value)                               \
+       static int __init trace_init_flags_##name(void)                 \
+       {                                                               \
+               event_##name.flags = value;                             \
+               return 0;                                               \
+       }                                                               \
+       early_initcall(trace_init_flags_##name);
+
 #define PERF_MAX_TRACE_SIZE    2048
 
 #define MAX_FILTER_PRED                32
@@ -215,6 +225,10 @@ enum {
        FILTER_PTR_STRING,
 };
 
+#define EVENT_STORAGE_SIZE 128
+extern struct mutex event_storage_mutex;
+extern char event_storage[EVENT_STORAGE_SIZE];
+
 extern int trace_event_raw_init(struct ftrace_event_call *call);
 extern int trace_define_field(struct ftrace_event_call *call, const char *type,
                              const char *name, int offset, int size,
index 6ed8812..caa151f 100644 (file)
@@ -90,6 +90,12 @@ extern struct group_info init_groups;
  */
 # define CAP_INIT_BSET  CAP_FULL_SET
 
+#ifdef CONFIG_RCU_BOOST
+#define INIT_TASK_RCU_BOOST()                                          \
+       .rcu_boost_mutex = NULL,
+#else
+#define INIT_TASK_RCU_BOOST()
+#endif
 #ifdef CONFIG_TREE_PREEMPT_RCU
 #define INIT_TASK_RCU_TREE_PREEMPT()                                   \
        .rcu_blocked_node = NULL,
@@ -101,7 +107,8 @@ extern struct group_info init_groups;
        .rcu_read_lock_nesting = 0,                                     \
        .rcu_read_unlock_special = 0,                                   \
        .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),           \
-       INIT_TASK_RCU_TREE_PREEMPT()
+       INIT_TASK_RCU_TREE_PREEMPT()                                    \
+       INIT_TASK_RCU_BOOST()
 #else
 #define INIT_TASK_RCU_PREEMPT(tsk)
 #endif
index e7d1b2e..b78edb5 100644 (file)
@@ -275,7 +275,9 @@ extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn);
 extern int arch_check_optimized_kprobe(struct optimized_kprobe *op);
 extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op);
 extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op);
-extern int  arch_optimize_kprobe(struct optimized_kprobe *op);
+extern void arch_optimize_kprobes(struct list_head *oplist);
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+                                   struct list_head *done_list);
 extern void arch_unoptimize_kprobe(struct optimized_kprobe *op);
 extern kprobe_opcode_t *get_optinsn_slot(void);
 extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty);
index 06aab5e..c536f85 100644 (file)
  * may be used to reset the timeout - for code which intentionally
  * disables interrupts for a long time. This call is stateless.
  */
-#ifdef ARCH_HAS_NMI_WATCHDOG
+#if defined(ARCH_HAS_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
 #include <asm/nmi.h>
 extern void touch_nmi_watchdog(void);
-extern void acpi_nmi_disable(void);
-extern void acpi_nmi_enable(void);
 #else
-#ifndef CONFIG_HARDLOCKUP_DETECTOR
 static inline void touch_nmi_watchdog(void)
 {
        touch_softlockup_watchdog();
 }
-#else
-extern void touch_nmi_watchdog(void);
-#endif
-static inline void acpi_nmi_disable(void) { }
-static inline void acpi_nmi_enable(void) { }
 #endif
 
 /*
index 4f1279e..dda5b0a 100644 (file)
@@ -215,8 +215,9 @@ struct perf_event_attr {
                                 */
                                precise_ip     :  2, /* skid constraint       */
                                mmap_data      :  1, /* non-exec mmap data    */
+                               sample_id_all  :  1, /* sample_type all events */
 
-                               __reserved_1   : 46;
+                               __reserved_1   : 45;
 
        union {
                __u32           wakeup_events;    /* wakeup every n events */
@@ -327,6 +328,15 @@ struct perf_event_header {
 enum perf_event_type {
 
        /*
+        * If perf_event_attr.sample_id_all is set then all event types will
+        * carry the sample_type-selected fields that identify where/when an
+        * event took place (TID, TIME, ID, CPU, STREAM_ID), as described for
+        * PERF_RECORD_SAMPLE below. They are stashed just after the
+        * perf_event_header and the fields already present for the existing
+        * record type, i.e. at the end of the payload. That way a newer
+        * perf.data file will be supported by older perf tools, with these
+        * new optional fields being ignored.
+        *
         * The MMAP events record the PROT_EXEC mappings so that we can
         * correlate userspace IPs to code. They have the following structure:
         *
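
From userspace, sample_id_all is simply another bit in the perf_event_attr handed to the perf_event_open syscall. A minimal hedged sketch (the helper name and the event selection are illustrative only, not part of this patch):

    #include <string.h>
    #include <sys/types.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/perf_event.h>

    /* Open a cycles counter whose non-sample records (MMAP, COMM, ...)
     * also carry TID/TIME/CPU identity fields via sample_id_all. */
    static int open_cycles_with_id_all(pid_t pid, int cpu)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;
            attr.sample_period = 100000;
            attr.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME |
                               PERF_SAMPLE_CPU;
            attr.sample_id_all = 1; /* stash identity after every record */
            attr.mmap = 1;
            attr.comm = 1;

            return syscall(__NR_perf_event_open, &attr, pid, cpu,
                           -1 /* group_fd */, 0 /* flags */);
    }

An older perf tool reading such a perf.data file keeps working because the extra identity bytes sit at the end of each record and are skipped via the record's header size.
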
@@ -578,6 +588,10 @@ struct perf_event;
 struct pmu {
        struct list_head                entry;
 
+       struct device                   *dev;
+       char                            *name;
+       int                             type;
+
        int * __percpu                  pmu_disable_count;
        struct perf_cpu_context * __percpu pmu_cpu_context;
        int                             task_ctx_nr;
@@ -758,6 +772,9 @@ struct perf_event {
        u64                             shadow_ctx_time;
 
        struct perf_event_attr          attr;
+       u16                             header_size;
+       u16                             id_header_size;
+       u16                             read_size;
        struct hw_perf_event            hw;
 
        struct perf_event_context       *ctx;
@@ -903,7 +920,7 @@ struct perf_output_handle {
 
 #ifdef CONFIG_PERF_EVENTS
 
-extern int perf_pmu_register(struct pmu *pmu);
+extern int perf_pmu_register(struct pmu *pmu, char *name, int type);
 extern void perf_pmu_unregister(struct pmu *pmu);
 
 extern int perf_num_counters(void);
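
With the extra name/type arguments a PMU now registers under a stable name and a type id. A hedged sketch of a registration call; the pmu contents are elided, and the convention that a negative type requests a dynamically allocated id is an assumption, not something visible in this hunk:

    static struct pmu my_pmu = {
            /* .event_init, .add, .del, .start, .stop, .read, ... */
    };

    static int __init my_pmu_init(void)
    {
            /* "my_pmu" is an illustrative name */
            return perf_pmu_register(&my_pmu, "my_pmu", -1);
    }
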
@@ -970,6 +987,11 @@ extern int perf_event_overflow(struct perf_event *event, int nmi,
                                 struct perf_sample_data *data,
                                 struct pt_regs *regs);
 
+static inline bool is_sampling_event(struct perf_event *event)
+{
+       return event->attr.sample_period != 0;
+}
+
 /*
  * Return 1 for a software event, 0 for a hardware event
  */
index f31ef61..2dea94f 100644 (file)
@@ -241,11 +241,6 @@ static inline void list_splice_init_rcu(struct list_head *list,
 #define list_first_entry_rcu(ptr, type, member) \
        list_entry_rcu((ptr)->next, type, member)
 
-#define __list_for_each_rcu(pos, head) \
-       for (pos = rcu_dereference_raw(list_next_rcu(head)); \
-               pos != (head); \
-               pos = rcu_dereference_raw(list_next_rcu((pos)))
-
 /**
  * list_for_each_entry_rcu     -       iterate over rcu list of given type
  * @pos:       the type * to use as a loop cursor.
index 03cda7b..af56148 100644 (file)
@@ -47,6 +47,8 @@
 extern int rcutorture_runnable; /* for sysctl */
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 
+#define UINT_CMP_GE(a, b)      (UINT_MAX / 2 >= (a) - (b))
+#define UINT_CMP_LT(a, b)      (UINT_MAX / 2 < (a) - (b))
 #define ULONG_CMP_GE(a, b)     (ULONG_MAX / 2 >= (a) - (b))
 #define ULONG_CMP_LT(a, b)     (ULONG_MAX / 2 < (a) - (b))
 
@@ -66,7 +68,6 @@ extern void call_rcu_sched(struct rcu_head *head,
 extern void synchronize_sched(void);
 extern void rcu_barrier_bh(void);
 extern void rcu_barrier_sched(void);
-extern void synchronize_sched_expedited(void);
 extern int sched_expedited_torture_stats(char *page);
 
 static inline void __rcu_read_lock_bh(void)
@@ -118,7 +119,6 @@ static inline int rcu_preempt_depth(void)
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
 /* Internal to kernel */
-extern void rcu_init(void);
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
 extern void rcu_check_callbacks(int cpu, int user);
index 13877cb..30ebd7c 100644 (file)
@@ -27,7 +27,9 @@
 
 #include <linux/cache.h>
 
-#define rcu_init_sched()       do { } while (0)
+static inline void rcu_init(void)
+{
+}
 
 #ifdef CONFIG_TINY_RCU
 
@@ -58,6 +60,11 @@ static inline void synchronize_rcu_bh_expedited(void)
        synchronize_sched();
 }
 
+static inline void synchronize_sched_expedited(void)
+{
+       synchronize_sched();
+}
+
 #ifdef CONFIG_TINY_RCU
 
 static inline void rcu_preempt_note_context_switch(void)
@@ -125,16 +132,12 @@ static inline void rcu_cpu_stall_reset(void)
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-
 extern int rcu_scheduler_active __read_mostly;
 extern void rcu_scheduler_starting(void);
-
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
 static inline void rcu_scheduler_starting(void)
 {
 }
-
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 #endif /* __LINUX_RCUTINY_H */
index 95518e6..3a93348 100644 (file)
@@ -30,6 +30,7 @@
 #ifndef __LINUX_RCUTREE_H
 #define __LINUX_RCUTREE_H
 
+extern void rcu_init(void);
 extern void rcu_note_context_switch(int cpu);
 extern int rcu_needs_cpu(int cpu);
 extern void rcu_cpu_stall_reset(void);
@@ -47,6 +48,7 @@ static inline void exit_rcu(void)
 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
 
 extern void synchronize_rcu_bh(void);
+extern void synchronize_sched_expedited(void);
 extern void synchronize_rcu_expedited(void);
 
 static inline void synchronize_rcu_bh_expedited(void)
index 883ad10..777cd01 100644 (file)
@@ -316,6 +316,7 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
                                  size_t *lenp, loff_t *ppos);
 extern unsigned int  softlockup_panic;
 extern int softlockup_thresh;
+void lockup_detector_init(void);
 #else
 static inline void touch_softlockup_watchdog(void)
 {
@@ -326,6 +327,9 @@ static inline void touch_softlockup_watchdog_sync(void)
 static inline void touch_all_softlockup_watchdogs(void)
 {
 }
+static inline void lockup_detector_init(void)
+{
+}
 #endif
 
 #ifdef CONFIG_DETECT_HUNG_TASK
@@ -1234,6 +1238,9 @@ struct task_struct {
 #ifdef CONFIG_TREE_PREEMPT_RCU
        struct rcu_node *rcu_blocked_node;
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+       struct rt_mutex *rcu_boost_mutex;
+#endif /* #ifdef CONFIG_RCU_BOOST */
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
        struct sched_info sched_info;
@@ -1766,7 +1773,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #ifdef CONFIG_PREEMPT_RCU
 
 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
-#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
+#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
 
 static inline void rcu_copy_process(struct task_struct *p)
 {
@@ -1774,7 +1782,10 @@ static inline void rcu_copy_process(struct task_struct *p)
        p->rcu_read_unlock_special = 0;
 #ifdef CONFIG_TREE_PREEMPT_RCU
        p->rcu_blocked_node = NULL;
-#endif
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+       p->rcu_boost_mutex = NULL;
+#endif /* #ifdef CONFIG_RCU_BOOST */
        INIT_LIST_HEAD(&p->rcu_node_entry);
 }
 
index 51efbef..25310f1 100644 (file)
@@ -2,6 +2,7 @@
 #define __LINUX_STACKTRACE_H
 
 struct task_struct;
+struct pt_regs;
 
 #ifdef CONFIG_STACKTRACE
 struct task_struct;
@@ -13,7 +14,8 @@ struct stack_trace {
 };
 
 extern void save_stack_trace(struct stack_trace *trace);
-extern void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp);
+extern void save_stack_trace_regs(struct stack_trace *trace,
+                                 struct pt_regs *regs);
 extern void save_stack_trace_tsk(struct task_struct *tsk,
                                struct stack_trace *trace);
 
index cacc27a..18cd068 100644 (file)
@@ -127,8 +127,6 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 #define SYSCALL_TRACE_ENTER_EVENT(sname)                               \
        static struct syscall_metadata                                  \
        __attribute__((__aligned__(4))) __syscall_meta_##sname;         \
-       static struct ftrace_event_call                                 \
-       __attribute__((__aligned__(4))) event_enter_##sname;            \
        static struct ftrace_event_call __used                          \
          __attribute__((__aligned__(4)))                               \
          __attribute__((section("_ftrace_events")))                    \
@@ -137,13 +135,12 @@ extern struct trace_event_functions exit_syscall_print_funcs;
                .class                  = &event_class_syscall_enter,   \
                .event.funcs            = &enter_syscall_print_funcs,   \
                .data                   = (void *)&__syscall_meta_##sname,\
-       }
+       };                                                              \
+       __TRACE_EVENT_FLAGS(enter_##sname, TRACE_EVENT_FL_CAP_ANY)
 
 #define SYSCALL_TRACE_EXIT_EVENT(sname)                                        \
        static struct syscall_metadata                                  \
        __attribute__((__aligned__(4))) __syscall_meta_##sname;         \
-       static struct ftrace_event_call                                 \
-       __attribute__((__aligned__(4))) event_exit_##sname;             \
        static struct ftrace_event_call __used                          \
          __attribute__((__aligned__(4)))                               \
          __attribute__((section("_ftrace_events")))                    \
@@ -152,7 +149,8 @@ extern struct trace_event_functions exit_syscall_print_funcs;
                .class                  = &event_class_syscall_exit,    \
                .event.funcs            = &exit_syscall_print_funcs,    \
                .data                   = (void *)&__syscall_meta_##sname,\
-       }
+       };                                                              \
+       __TRACE_EVENT_FLAGS(exit_##sname, TRACE_EVENT_FL_CAP_ANY)
 
 #define SYSCALL_METADATA(sname, nb)                            \
        SYSCALL_TRACE_ENTER_EVENT(sname);                       \
index a4a90b6..d3e4f87 100644 (file)
@@ -106,6 +106,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
 
 #define TP_PROTO(args...)      args
 #define TP_ARGS(args...)       args
+#define TP_CONDITION(args...)  args
 
 #ifdef CONFIG_TRACEPOINTS
 
@@ -119,12 +120,14 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
  * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just
  * "void *data", where as the DECLARE_TRACE() will pass in "void *data, proto".
  */
-#define __DO_TRACE(tp, proto, args)                                    \
+#define __DO_TRACE(tp, proto, args, cond)                              \
        do {                                                            \
                struct tracepoint_func *it_func_ptr;                    \
                void *it_func;                                          \
                void *__data;                                           \
                                                                        \
+               if (!(cond))                                            \
+                       return;                                         \
                rcu_read_lock_sched_notrace();                          \
                it_func_ptr = rcu_dereference_sched((tp)->funcs);       \
                if (it_func_ptr) {                                      \
@@ -142,7 +145,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
  * not add unwanted padding between the beginning of the section and the
  * structure. Force alignment to the same alignment as the section start.
  */
-#define __DECLARE_TRACE(name, proto, args, data_proto, data_args)      \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args)        \
        extern struct tracepoint __tracepoint_##name;                   \
        static inline void trace_##name(proto)                          \
        {                                                               \
@@ -151,7 +154,8 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
 do_trace:                                                              \
                        __DO_TRACE(&__tracepoint_##name,                \
                                TP_PROTO(data_proto),                   \
-                               TP_ARGS(data_args));                    \
+                               TP_ARGS(data_args),                     \
+                               TP_CONDITION(cond));                    \
        }                                                               \
        static inline int                                               \
        register_trace_##name(void (*probe)(data_proto), void *data)    \
@@ -186,7 +190,7 @@ do_trace:                                                           \
        EXPORT_SYMBOL(__tracepoint_##name)
 
 #else /* !CONFIG_TRACEPOINTS */
-#define __DECLARE_TRACE(name, proto, args, data_proto, data_args)      \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args)        \
        static inline void trace_##name(proto)                          \
        { }                                                             \
        static inline int                                               \
@@ -227,13 +231,20 @@ do_trace:                                                         \
  * "void *__data, proto" as the callback prototype.
  */
 #define DECLARE_TRACE_NOARGS(name)                                     \
-               __DECLARE_TRACE(name, void, , void *__data, __data)
+               __DECLARE_TRACE(name, void, , 1, void *__data, __data)
 
 #define DECLARE_TRACE(name, proto, args)                               \
-               __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),      \
+               __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), 1,   \
                                PARAMS(void *__data, proto),            \
                                PARAMS(__data, args))
 
+#define DECLARE_TRACE_CONDITION(name, proto, args, cond)               \
+       __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), PARAMS(cond), \
+                       PARAMS(void *__data, proto),                    \
+                       PARAMS(__data, args))
+
+#define TRACE_EVENT_FLAGS(event, flag)
+
 #endif /* DECLARE_TRACE */
 
 #ifndef TRACE_EVENT
@@ -347,11 +358,21 @@ do_trace:                                                         \
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_CONDITION(template, name, proto,          \
+                              args, cond)                      \
+       DECLARE_TRACE_CONDITION(name, PARAMS(proto),            \
+                               PARAMS(args), PARAMS(cond))
 
 #define TRACE_EVENT(name, proto, args, struct, assign, print)  \
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 #define TRACE_EVENT_FN(name, proto, args, struct,              \
                assign, print, reg, unreg)                      \
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define TRACE_EVENT_CONDITION(name, proto, args, cond,         \
+                             struct, assign, print)            \
+       DECLARE_TRACE_CONDITION(name, PARAMS(proto),            \
+                               PARAMS(args), PARAMS(cond))
+
+#define TRACE_EVENT_FLAGS(event, flag)
 
 #endif /* ifdef TRACE_EVENT (see note above) */
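(Annotation: the TP_CONDITION/DECLARE_TRACE_CONDITION plumbing added above evaluates the condition before any probe-iteration work, so an event whose condition fails costs only the test itself. Below is a minimal user-space sketch of the same macro pattern, assuming stand-in names (probe_sample, trace_sample) rather than the kernel's generated symbols, and using a break where the kernel's __DO_TRACE returns early.)

#include <stdio.h>

#define TP_PROTO(args...)     args
#define TP_ARGS(args...)      args
#define TP_CONDITION(args...) args

/* The expensive part is only reached when the condition holds. */
#define __DO_TRACE(fn, args, cond)                              \
        do {                                                    \
                if (!(cond))                                    \
                        break;                                  \
                fn(args);                                       \
        } while (0)

#define DECLARE_TRACE_CONDITION(name, proto, args, cond)        \
        static inline void trace_##name(proto)                  \
        {                                                       \
                __DO_TRACE(probe_##name, TP_ARGS(args),         \
                           TP_CONDITION(cond));                 \
        }

/* Stand-in for a probe the kernel would attach at run time. */
static void probe_sample(int cpu, int state)
{
        printf("sample: cpu=%d state=%d\n", cpu, state);
}

/* Only fire when state is non-zero, as a TRACE_EVENT_CONDITION user would. */
DECLARE_TRACE_CONDITION(sample,
        TP_PROTO(int cpu, int state),
        TP_ARGS(cpu, state),
        TP_CONDITION(state != 0));

int main(void)
{
        trace_sample(0, 0);     /* filtered out by the condition */
        trace_sample(1, 3);     /* reaches the probe */
        return 0;
}

(Built with gcc -std=gnu99, only the second call prints; the condition keeps the first call from touching the probe at all.)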
index 1dfab54..b0b4eb2 100644 (file)
 #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
        DEFINE_TRACE(name)
 
+#undef TRACE_EVENT_CONDITION
+#define TRACE_EVENT_CONDITION(name, proto, args, cond, tstruct, assign, print) \
+       TRACE_EVENT(name,                                               \
+               PARAMS(proto),                                          \
+               PARAMS(args),                                           \
+               PARAMS(tstruct),                                        \
+               PARAMS(assign),                                         \
+               PARAMS(print))
+
 #undef TRACE_EVENT_FN
 #define TRACE_EVENT_FN(name, proto, args, tstruct,             \
                assign, print, reg, unreg)                      \
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
        DEFINE_TRACE(name)
 
+#undef DEFINE_EVENT_CONDITION
+#define DEFINE_EVENT_CONDITION(template, name, proto, args, cond) \
+       DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #undef DECLARE_TRACE
 #define DECLARE_TRACE(name, proto, args)       \
        DEFINE_TRACE(name)
 
 #undef TRACE_EVENT
 #undef TRACE_EVENT_FN
+#undef TRACE_EVENT_CONDITION
 #undef DECLARE_EVENT_CLASS
 #undef DEFINE_EVENT
 #undef DEFINE_EVENT_PRINT
+#undef DEFINE_EVENT_CONDITION
 #undef TRACE_HEADER_MULTI_READ
 #undef DECLARE_TRACE
 
index 286784d..1bcc2a8 100644 (file)
@@ -7,16 +7,67 @@
 #include <linux/ktime.h>
 #include <linux/tracepoint.h>
 
-#ifndef _TRACE_POWER_ENUM_
-#define _TRACE_POWER_ENUM_
-enum {
-       POWER_NONE      = 0,
-       POWER_CSTATE    = 1,    /* C-State */
-       POWER_PSTATE    = 2,    /* Fequency change or DVFS */
-       POWER_SSTATE    = 3,    /* Suspend */
-};
+DECLARE_EVENT_CLASS(cpu,
+
+       TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(state, cpu_id),
+
+       TP_STRUCT__entry(
+               __field(        u32,            state           )
+               __field(        u32,            cpu_id          )
+       ),
+
+       TP_fast_assign(
+               __entry->state = state;
+               __entry->cpu_id = cpu_id;
+       ),
+
+       TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
+                 (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(cpu, cpu_idle,
+
+       TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(state, cpu_id)
+);
+
+/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING
+
+#define PWR_EVENT_EXIT -1
 #endif
 
+DEFINE_EVENT(cpu, cpu_frequency,
+
+       TP_PROTO(unsigned int frequency, unsigned int cpu_id),
+
+       TP_ARGS(frequency, cpu_id)
+);
+
+TRACE_EVENT(machine_suspend,
+
+       TP_PROTO(unsigned int state),
+
+       TP_ARGS(state),
+
+       TP_STRUCT__entry(
+               __field(        u32,            state           )
+       ),
+
+       TP_fast_assign(
+               __entry->state = state;
+       ),
+
+       TP_printk("state=%lu", (unsigned long)__entry->state)
+);
+
+/* This code will be removed after the deprecation period has expired (2.6.41) */
+#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
+
 /*
  * The power events are used for cpuidle & suspend (power_start, power_end)
  *  and for cpufreq (power_frequency)
@@ -75,6 +126,36 @@ TRACE_EVENT(power_end,
 
 );
 
+/* Deprecated dummy functions must be protected against multi-declaration */
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+
+enum {
+       POWER_NONE = 0,
+       POWER_CSTATE = 1,
+       POWER_PSTATE = 2,
+};
+#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
+
+#else /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
+
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+enum {
+       POWER_NONE = 0,
+       POWER_CSTATE = 1,
+       POWER_PSTATE = 2,
+};
+
+/* These dummy declarations have to be ripped out when the deprecated
+   events get removed */
+static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {};
+static inline void trace_power_end(u64 cpuid) {};
+static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {};
+#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
+
+#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
+
 /*
  * The clock events are used for clock enable/disable and for
  *  clock rate change
@@ -153,7 +234,6 @@ DEFINE_EVENT(power_domain, power_domain_target,
 
        TP_ARGS(name, state, cpu_id)
 );
-
 #endif /* _TRACE_POWER_H */
 
 /* This part must be outside protection */
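(Annotation: the deprecated power_start/power_end/power_frequency events above stay available behind CONFIG_EVENT_POWER_TRACING_DEPRECATED; when the option is off, empty inline stubs keep call sites compiling, and a once-only guard protects the non-event definitions because trace headers are deliberately read more than once. A small user-space sketch of that compile-out pattern, using made-up option and function names:)

#include <stdint.h>
#include <stdio.h>

/* #define CONFIG_MY_DEPRECATED_EVENTS 1 */

#ifdef CONFIG_MY_DEPRECATED_EVENTS
static inline void trace_legacy_power_start(uint64_t type, uint64_t state,
                                            uint64_t cpuid)
{
        printf("power_start: type=%llu state=%llu cpu=%llu\n",
               (unsigned long long)type, (unsigned long long)state,
               (unsigned long long)cpuid);
}
#else
/* Event compiled out: call sites keep building, the work disappears. */
static inline void trace_legacy_power_start(uint64_t type, uint64_t state,
                                            uint64_t cpuid) { }
#endif

int main(void)
{
        /* The call site is identical whichever way the option is set. */
        trace_legacy_power_start(1, 2, 0);
        return 0;
}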
index fb726ac..5a4c04a 100644 (file)
@@ -40,6 +40,8 @@ TRACE_EVENT_FN(sys_enter,
        syscall_regfunc, syscall_unregfunc
 );
 
+TRACE_EVENT_FLAGS(sys_enter, TRACE_EVENT_FL_CAP_ANY)
+
 TRACE_EVENT_FN(sys_exit,
 
        TP_PROTO(struct pt_regs *regs, long ret),
@@ -62,6 +64,8 @@ TRACE_EVENT_FN(sys_exit,
        syscall_regfunc, syscall_unregfunc
 );
 
+TRACE_EVENT_FLAGS(sys_exit, TRACE_EVENT_FL_CAP_ANY)
+
 #endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */
 
 #endif /* _TRACE_EVENTS_SYSCALLS_H */
index a9377c0..e16610c 100644 (file)
        TRACE_EVENT(name, PARAMS(proto), PARAMS(args),                  \
                PARAMS(tstruct), PARAMS(assign), PARAMS(print))         \
 
+#undef TRACE_EVENT_FLAGS
+#define TRACE_EVENT_FLAGS(name, value)                                 \
+       __TRACE_EVENT_FLAGS(name, value)
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
        DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
 
+#undef TRACE_EVENT_FLAGS
+#define TRACE_EVENT_FLAGS(event, flag)
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
@@ -289,13 +296,19 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = {    \
 
 #undef __array
 #define __array(type, item, len)                                       \
-       BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);                         \
-       ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+       do {                                                            \
+               mutex_lock(&event_storage_mutex);                       \
+               BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);                 \
+               snprintf(event_storage, sizeof(event_storage),          \
+                        "%s[%d]", #type, len);                         \
+               ret = trace_define_field(event_call, event_storage, #item, \
                                 offsetof(typeof(field), item),         \
                                 sizeof(field.item),                    \
                                 is_signed_type(type), FILTER_OTHER);   \
-       if (ret)                                                        \
-               return ret;
+               mutex_unlock(&event_storage_mutex);                     \
+               if (ret)                                                \
+                       return ret;                                     \
+       } while (0);
 
 #undef __dynamic_array
 #define __dynamic_array(type, item, len)                                      \
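(Annotation: the reworked __array() above now composes the "type[len]" string in a shared event_storage buffer, so the format-and-register step is serialized with event_storage_mutex. A user-space sketch of the same shared-scratch-buffer-under-a-mutex pattern; buffer size, helper names and the stand-in for trace_define_field() are assumptions for illustration only:)

#include <pthread.h>
#include <stdio.h>

static char event_storage[128];
static pthread_mutex_t event_storage_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for trace_define_field(): just report the composed type name. */
static int define_field(const char *type, const char *name)
{
        printf("field: %-12s %s\n", type, name);
        return 0;
}

static int define_array_field(const char *type, const char *name, int len)
{
        int ret;

        pthread_mutex_lock(&event_storage_mutex);
        snprintf(event_storage, sizeof(event_storage), "%s[%d]", type, len);
        ret = define_field(event_storage, name);
        pthread_mutex_unlock(&event_storage_mutex);

        return ret;
}

int main(void)
{
        define_array_field("char", "comm", 16);
        define_array_field("u32", "slots", 4);
        return 0;
}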
index 62b7149..8dfd094 100644 (file)
@@ -393,7 +393,6 @@ config PREEMPT_RCU
 
 config RCU_TRACE
        bool "Enable tracing for RCU"
-       depends on TREE_RCU || TREE_PREEMPT_RCU
        help
          This option provides tracing in RCU which presents stats
          in debugfs for debugging RCU implementation.
@@ -459,6 +458,60 @@ config TREE_RCU_TRACE
          TREE_PREEMPT_RCU implementations, permitting Makefile to
          trivially select kernel/rcutree_trace.c.
 
+config RCU_BOOST
+       bool "Enable RCU priority boosting"
+       depends on RT_MUTEXES && TINY_PREEMPT_RCU
+       default n
+       help
+         This option boosts the priority of preempted RCU readers that
+         block the current preemptible RCU grace period for too long.
+         This option also prevents heavy loads from blocking RCU
+         callback invocation for all flavors of RCU.
+
+         Say Y here if you are working with real-time apps or heavy loads
+         Say Y here if you are working with real-time apps or heavy loads.
+
+config RCU_BOOST_PRIO
+       int "Real-time priority to boost RCU readers to"
+       range 1 99
+       depends on RCU_BOOST
+       default 1
+       help
+         This option specifies the real-time priority to which preempted
+         RCU readers are to be boosted.  If you are working with CPU-bound
+         real-time applications, you should specify a priority higher than
+         the highest-priority CPU-bound application.
+
+         Specify the real-time priority, or take the default if unsure.
+
+config RCU_BOOST_DELAY
+       int "Milliseconds to delay boosting after RCU grace-period start"
+       range 0 3000
+       depends on RCU_BOOST
+       default 500
+       help
+         This option specifies the time to wait after the beginning of
+         a given grace period before priority-boosting preempted RCU
+         readers blocking that grace period.  Note that any RCU reader
+         blocking an expedited RCU grace period is boosted immediately.
+
+         Accept the default if unsure.
+
+config SRCU_SYNCHRONIZE_DELAY
+       int "Microseconds to delay before waiting for readers"
+       range 0 20
+       default 10
+       help
+         This option controls how long SRCU delays before entering its
+         loop waiting on SRCU readers.  The purpose of this loop is
+         to avoid the unconditional context-switch penalty that would
+         otherwise be incurred if there was an active SRCU reader,
+         in a manner similar to adaptive locking schemes.  This should
+         be set to be a bit longer than the common-case SRCU read-side
+         critical-section overhead.
+
+         Accept the default if unsure.
+
 endmenu # "RCU Subsystem"
 
 config IKCONFIG
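(Annotation: SRCU_SYNCHRONIZE_DELAY above tunes how long the updater spins in the hope that readers drain before it pays the cost of blocking. A rough user-space sketch of that spin-then-block idea follows; the reader counter, poll interval and timing code are illustrative and not the kernel's SRCU implementation:)

#include <stdatomic.h>
#include <stdbool.h>
#include <time.h>
#include <unistd.h>

static atomic_int active_readers;

static bool readers_done(void)
{
        return atomic_load(&active_readers) == 0;
}

/* Wait for readers: short busy-wait first, then sleep-poll. */
static void synchronize_readers(unsigned int spin_us)
{
        struct timespec start, now;

        clock_gettime(CLOCK_MONOTONIC, &start);
        for (;;) {
                if (readers_done())
                        return;
                clock_gettime(CLOCK_MONOTONIC, &now);
                if ((now.tv_sec - start.tv_sec) * 1000000L +
                    (now.tv_nsec - start.tv_nsec) / 1000L >= spin_us)
                        break;          /* spin window exhausted */
        }

        /* Slow path: give up the CPU between checks. */
        while (!readers_done())
                usleep(100);
}

int main(void)
{
        synchronize_readers(10);        /* mirrors the 10 us default above */
        return 0;
}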
index 8646401..ea51770 100644 (file)
@@ -67,6 +67,7 @@
 #include <linux/sfi.h>
 #include <linux/shmem_fs.h>
 #include <linux/slab.h>
+#include <linux/perf_event.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -603,6 +604,8 @@ asmlinkage void __init start_kernel(void)
                                "enabled *very* early, fixing it\n");
                local_irq_disable();
        }
+       idr_init_cache();
+       perf_event_init();
        rcu_init();
        radix_tree_init();
        /* init some links before init_ISA_irqs() */
@@ -658,7 +661,6 @@ asmlinkage void __init start_kernel(void)
        enable_debug_pagealloc();
        kmemleak_init();
        debug_objects_mem_init();
-       idr_init_cache();
        setup_per_cpu_pageset();
        numa_policy_init();
        if (late_time_init)
@@ -882,6 +884,7 @@ static int __init kernel_init(void * unused)
        smp_prepare_cpus(setup_max_cpus);
 
        do_pre_smp_initcalls();
+       lockup_detector_init();
 
        smp_init();
        sched_init_smp();
index 40a8777..3019b92 100644 (file)
@@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled;
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SHARED           0x01
+#define FLAGS_CLOCKRT          0x02
+#define FLAGS_HAS_TIMEOUT      0x04
+
+/*
  * Priority Inheritance state:
  */
 struct futex_pi_state {
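(Annotation: FLAGS_SHARED/FLAGS_CLOCKRT/FLAGS_HAS_TIMEOUT now form the single flags word threaded through every futex path and stored in the restart block, replacing the separate fshared/clockrt arguments. A sketch of how the user-visible futex op bits map onto such a flags word, mirroring what do_futex() does further down; the decode helper is illustrative, while the FUTEX_* constants come from the uapi header:)

#include <linux/futex.h>
#include <stdio.h>

#define FLAGS_SHARED            0x01
#define FLAGS_CLOCKRT           0x02
#define FLAGS_HAS_TIMEOUT       0x04

/* Derive the flag word from a futex op, as do_futex() now does. */
static unsigned int futex_op_to_flags(int op)
{
        unsigned int flags = 0;

        if (!(op & FUTEX_PRIVATE_FLAG))
                flags |= FLAGS_SHARED;
        if (op & FUTEX_CLOCK_REALTIME)
                flags |= FLAGS_CLOCKRT;
        return flags;
}

int main(void)
{
        unsigned int flags = futex_op_to_flags(FUTEX_WAIT_BITSET |
                                               FUTEX_CLOCK_REALTIME);

        printf("shared=%d clockrt=%d timeout=%d\n",
               !!(flags & FLAGS_SHARED), !!(flags & FLAGS_CLOCKRT),
               !!(flags & FLAGS_HAS_TIMEOUT));
        return 0;
}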
@@ -123,6 +131,12 @@ struct futex_q {
        u32 bitset;
 };
 
+static const struct futex_q futex_q_init = {
+       /* list gets initialized in queue_me()*/
+       .key = FUTEX_KEY_INIT,
+       .bitset = FUTEX_BITSET_MATCH_ANY
+};
+
 /*
  * Hash buckets are shared by all the futex_keys that hash to the same
  * location.  Each key may have multiple futex_q structures, one for each task
@@ -283,8 +297,7 @@ again:
        return 0;
 }
 
-static inline
-void put_futex_key(int fshared, union futex_key *key)
+static inline void put_futex_key(union futex_key *key)
 {
        drop_futex_key_refs(key);
 }
@@ -870,7 +883,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 /*
  * Wake up waiters matching bitset queued on this futex (uaddr).
  */
-static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 {
        struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
@@ -881,7 +895,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
        if (!bitset)
                return -EINVAL;
 
-       ret = get_futex_key(uaddr, fshared, &key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
        if (unlikely(ret != 0))
                goto out;
 
@@ -907,7 +921,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
        }
 
        spin_unlock(&hb->lock);
-       put_futex_key(fshared, &key);
+       put_futex_key(&key);
 out:
        return ret;
 }
@@ -917,7 +931,7 @@ out:
  * to this virtual address:
  */
 static int
-futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
              int nr_wake, int nr_wake2, int op)
 {
        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +941,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
        int ret, op_ret;
 
 retry:
-       ret = get_futex_key(uaddr1, fshared, &key1);
+       ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
        if (unlikely(ret != 0))
                goto out;
-       ret = get_futex_key(uaddr2, fshared, &key2);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
        if (unlikely(ret != 0))
                goto out_put_key1;
 
@@ -962,11 +976,11 @@ retry_private:
                if (ret)
                        goto out_put_keys;
 
-               if (!fshared)
+               if (!(flags & FLAGS_SHARED))
                        goto retry_private;
 
-               put_futex_key(fshared, &key2);
-               put_futex_key(fshared, &key1);
+               put_futex_key(&key2);
+               put_futex_key(&key1);
                goto retry;
        }
 
@@ -996,9 +1010,9 @@ retry_private:
 
        double_unlock_hb(hb1, hb2);
 out_put_keys:
-       put_futex_key(fshared, &key2);
+       put_futex_key(&key2);
 out_put_key1:
-       put_futex_key(fshared, &key1);
+       put_futex_key(&key1);
 out:
        return ret;
 }
@@ -1133,13 +1147,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 /**
  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
  * @uaddr1:    source futex user address
- * @fshared:   0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags:     futex flags (FLAGS_SHARED, etc.)
  * @uaddr2:    target futex user address
  * @nr_wake:   number of waiters to wake (must be 1 for requeue_pi)
  * @nr_requeue:        number of waiters to requeue (0-INT_MAX)
  * @cmpval:    @uaddr1 expected value (or %NULL)
  * @requeue_pi:        if we are attempting to requeue from a non-pi futex to a
- *             pi futex (pi to pi requeue is not supported)
+ *             pi futex (pi to pi requeue is not supported)
  *
  * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
  * uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1162,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
  * >=0 - on success, the number of tasks requeued or woken
  *  <0 - on error
  */
-static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
-                        int nr_wake, int nr_requeue, u32 *cmpval,
-                        int requeue_pi)
+static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+                        u32 __user *uaddr2, int nr_wake, int nr_requeue,
+                        u32 *cmpval, int requeue_pi)
 {
        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
        int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1205,10 @@ retry:
                pi_state = NULL;
        }
 
-       ret = get_futex_key(uaddr1, fshared, &key1);
+       ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
        if (unlikely(ret != 0))
                goto out;
-       ret = get_futex_key(uaddr2, fshared, &key2);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
        if (unlikely(ret != 0))
                goto out_put_key1;
 
@@ -1216,11 +1230,11 @@ retry_private:
                        if (ret)
                                goto out_put_keys;
 
-                       if (!fshared)
+                       if (!(flags & FLAGS_SHARED))
                                goto retry_private;
 
-                       put_futex_key(fshared, &key2);
-                       put_futex_key(fshared, &key1);
+                       put_futex_key(&key2);
+                       put_futex_key(&key1);
                        goto retry;
                }
                if (curval != *cmpval) {
@@ -1260,8 +1274,8 @@ retry_private:
                        break;
                case -EFAULT:
                        double_unlock_hb(hb1, hb2);
-                       put_futex_key(fshared, &key2);
-                       put_futex_key(fshared, &key1);
+                       put_futex_key(&key2);
+                       put_futex_key(&key1);
                        ret = fault_in_user_writeable(uaddr2);
                        if (!ret)
                                goto retry;
@@ -1269,8 +1283,8 @@ retry_private:
                case -EAGAIN:
                        /* The owner was exiting, try again. */
                        double_unlock_hb(hb1, hb2);
-                       put_futex_key(fshared, &key2);
-                       put_futex_key(fshared, &key1);
+                       put_futex_key(&key2);
+                       put_futex_key(&key1);
                        cond_resched();
                        goto retry;
                default:
@@ -1352,9 +1366,9 @@ out_unlock:
                drop_futex_key_refs(&key1);
 
 out_put_keys:
-       put_futex_key(fshared, &key2);
+       put_futex_key(&key2);
 out_put_key1:
-       put_futex_key(fshared, &key1);
+       put_futex_key(&key1);
 out:
        if (pi_state != NULL)
                free_pi_state(pi_state);
@@ -1494,7 +1508,7 @@ static void unqueue_me_pi(struct futex_q *q)
  * private futexes.
  */
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-                               struct task_struct *newowner, int fshared)
+                               struct task_struct *newowner)
 {
        u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
        struct futex_pi_state *pi_state = q->pi_state;
@@ -1587,20 +1601,11 @@ handle_fault:
        goto retry;
 }
 
-/*
- * In case we must use restart_block to restart a futex_wait,
- * we encode in the 'flags' shared capability
- */
-#define FLAGS_SHARED           0x01
-#define FLAGS_CLOCKRT          0x02
-#define FLAGS_HAS_TIMEOUT      0x04
-
 static long futex_wait_restart(struct restart_block *restart);
 
 /**
  * fixup_owner() - Post lock pi_state and corner case management
  * @uaddr:     user address of the futex
- * @fshared:   whether the futex is shared (1) or not (0)
  * @q:         futex_q (contains pi_state and access to the rt_mutex)
  * @locked:    if the attempt to take the rt_mutex succeeded (1) or not (0)
  *
@@ -1613,8 +1618,7 @@ static long futex_wait_restart(struct restart_block *restart);
  *  0 - success, lock not taken
  * <0 - on error (-EFAULT)
  */
-static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
-                      int locked)
+static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 {
        struct task_struct *owner;
        int ret = 0;
@@ -1625,7 +1629,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
                 * did a lock-steal - fix up the PI-state in that case:
                 */
                if (q->pi_state->owner != current)
-                       ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+                       ret = fixup_pi_state_owner(uaddr, q, current);
                goto out;
        }
 
@@ -1652,7 +1656,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
                 * lock. Fix the state up.
                 */
                owner = rt_mutex_owner(&q->pi_state->pi_mutex);
-               ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+               ret = fixup_pi_state_owner(uaddr, q, owner);
                goto out;
        }
 
@@ -1715,7 +1719,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
  * futex_wait_setup() - Prepare to wait on a futex
  * @uaddr:     the futex userspace address
  * @val:       the expected value
- * @fshared:   whether the futex is shared (1) or not (0)
+ * @flags:     futex flags (FLAGS_SHARED, etc.)
  * @q:         the associated futex_q
  * @hb:                storage for hash_bucket pointer to be returned to caller
  *
@@ -1728,7 +1732,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
  *  0 - uaddr contains val and hb has been locked
  * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
  */
-static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
                           struct futex_q *q, struct futex_hash_bucket **hb)
 {
        u32 uval;
@@ -1752,8 +1756,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
         * rare, but normal.
         */
 retry:
-       q->key = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr, fshared, &q->key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
        if (unlikely(ret != 0))
                return ret;
 
@@ -1769,10 +1772,10 @@ retry_private:
                if (ret)
                        goto out;
 
-               if (!fshared)
+               if (!(flags & FLAGS_SHARED))
                        goto retry_private;
 
-               put_futex_key(fshared, &q->key);
+               put_futex_key(&q->key);
                goto retry;
        }
 
@@ -1783,32 +1786,29 @@ retry_private:
 
 out:
        if (ret)
-               put_futex_key(fshared, &q->key);
+               put_futex_key(&q->key);
        return ret;
 }
 
-static int futex_wait(u32 __user *uaddr, int fshared,
-                     u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+                     ktime_t *abs_time, u32 bitset)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
        struct restart_block *restart;
        struct futex_hash_bucket *hb;
-       struct futex_q q;
+       struct futex_q q = futex_q_init;
        int ret;
 
        if (!bitset)
                return -EINVAL;
-
-       q.pi_state = NULL;
        q.bitset = bitset;
-       q.rt_waiter = NULL;
-       q.requeue_pi_key = NULL;
 
        if (abs_time) {
                to = &timeout;
 
-               hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
-                                     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+               hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+                                     CLOCK_REALTIME : CLOCK_MONOTONIC,
+                                     HRTIMER_MODE_ABS);
                hrtimer_init_sleeper(to, current);
                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
                                             current->timer_slack_ns);
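(Annotation: when a timeout is supplied, futex_wait() still arms an absolute-expiry timer, but the clock is now chosen from FLAGS_CLOCKRT instead of a separate clockrt argument. A user-space sketch of the same flag-selects-the-clock, absolute-deadline pattern using clock_nanosleep(); the flag name is reused here only for illustration:)

#define _GNU_SOURCE
#include <stdio.h>
#include <time.h>

#define FLAGS_CLOCKRT   0x02

/* Sleep until an absolute deadline 'ns' nanoseconds from now. */
static int wait_absolute(unsigned int flags, long ns)
{
        clockid_t clk = (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME
                                                : CLOCK_MONOTONIC;
        struct timespec deadline;

        clock_gettime(clk, &deadline);
        deadline.tv_nsec += ns;
        deadline.tv_sec += deadline.tv_nsec / 1000000000L;
        deadline.tv_nsec %= 1000000000L;

        return clock_nanosleep(clk, TIMER_ABSTIME, &deadline, NULL);
}

int main(void)
{
        wait_absolute(0, 5 * 1000 * 1000);      /* 5 ms on the monotonic clock */
        printf("deadline reached\n");
        return 0;
}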
@@ -1819,7 +1819,7 @@ retry:
         * Prepare to wait on uaddr. On success, holds hb lock and increments
         * q.key refs.
         */
-       ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+       ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
        if (ret)
                goto out;
 
@@ -1852,12 +1852,7 @@ retry:
        restart->futex.val = val;
        restart->futex.time = abs_time->tv64;
        restart->futex.bitset = bitset;
-       restart->futex.flags = FLAGS_HAS_TIMEOUT;
-
-       if (fshared)
-               restart->futex.flags |= FLAGS_SHARED;
-       if (clockrt)
-               restart->futex.flags |= FLAGS_CLOCKRT;
+       restart->futex.flags = flags;
 
        ret = -ERESTART_RESTARTBLOCK;
 
@@ -1873,7 +1868,6 @@ out:
 static long futex_wait_restart(struct restart_block *restart)
 {
        u32 __user *uaddr = restart->futex.uaddr;
-       int fshared = 0;
        ktime_t t, *tp = NULL;
 
        if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1881,11 +1875,9 @@ static long futex_wait_restart(struct restart_block *restart)
                tp = &t;
        }
        restart->fn = do_no_restart_syscall;
-       if (restart->futex.flags & FLAGS_SHARED)
-               fshared = 1;
-       return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
-                               restart->futex.bitset,
-                               restart->futex.flags & FLAGS_CLOCKRT);
+
+       return (long)futex_wait(uaddr, restart->futex.flags,
+                               restart->futex.val, tp, restart->futex.bitset);
 }
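(Annotation: from user space, the path above corresponds to FUTEX_WAIT_BITSET with an absolute timeout, optionally measured against CLOCK_REALTIME via FUTEX_CLOCK_REALTIME, which is exactly the pair of options the kernel now preserves in restart->futex.flags. A minimal sketch with no error handling; since nothing wakes the futex word, the call simply times out after roughly 100 ms:)

#define _GNU_SOURCE
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static uint32_t futex_word;

int main(void)
{
        struct timespec deadline;

        /* Absolute deadline 100 ms from now on CLOCK_REALTIME. */
        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_nsec += 100 * 1000 * 1000;
        deadline.tv_sec += deadline.tv_nsec / 1000000000L;
        deadline.tv_nsec %= 1000000000L;

        /* Blocks until woken, the deadline passes, or a signal interrupts. */
        long ret = syscall(SYS_futex, &futex_word,
                           FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG |
                           FUTEX_CLOCK_REALTIME,
                           0, &deadline, NULL, FUTEX_BITSET_MATCH_ANY);

        printf("futex returned %ld\n", ret);
        return 0;
}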
 
 
@@ -1895,12 +1887,12 @@ static long futex_wait_restart(struct restart_block *restart)
  * if there are waiters then it will block, it does PI, etc. (Due to
  * races the kernel might see a 0 value of the futex too.)
  */
-static int futex_lock_pi(u32 __user *uaddr, int fshared,
-                        int detect, ktime_t *time, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+                        ktime_t *time, int trylock)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
        struct futex_hash_bucket *hb;
-       struct futex_q q;
+       struct futex_q q = futex_q_init;
        int res, ret;
 
        if (refill_pi_state_cache())
@@ -1914,12 +1906,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
                hrtimer_set_expires(&to->timer, *time);
        }
 
-       q.pi_state = NULL;
-       q.rt_waiter = NULL;
-       q.requeue_pi_key = NULL;
 retry:
-       q.key = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr, fshared, &q.key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
        if (unlikely(ret != 0))
                goto out;
 
@@ -1941,7 +1929,7 @@ retry_private:
                         * exit to complete.
                         */
                        queue_unlock(&q, hb);
-                       put_futex_key(fshared, &q.key);
+                       put_futex_key(&q.key);
                        cond_resched();
                        goto retry;
                default:
@@ -1971,7 +1959,7 @@ retry_private:
         * Fixup the pi_state owner and possibly acquire the lock if we
         * haven't already.
         */
-       res = fixup_owner(uaddr, fshared, &q, !ret);
+       res = fixup_owner(uaddr, &q, !ret);
        /*
         * If fixup_owner() returned an error, propagate that.  If it acquired
         * the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1995,7 +1983,7 @@ out_unlock_put_key:
        queue_unlock(&q, hb);
 
 out_put_key:
-       put_futex_key(fshared, &q.key);
+       put_futex_key(&q.key);
 out:
        if (to)
                destroy_hrtimer_on_stack(&to->timer);
@@ -2008,10 +1996,10 @@ uaddr_faulted:
        if (ret)
                goto out_put_key;
 
-       if (!fshared)
+       if (!(flags & FLAGS_SHARED))
                goto retry_private;
 
-       put_futex_key(fshared, &q.key);
+       put_futex_key(&q.key);
        goto retry;
 }
 
@@ -2020,7 +2008,7 @@ uaddr_faulted:
  * This is the in-kernel slowpath: we look up the PI state (if any),
  * and do the rt-mutex unlock.
  */
-static int futex_unlock_pi(u32 __user *uaddr, int fshared)
+static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 {
        struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
@@ -2038,7 +2026,7 @@ retry:
        if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
                return -EPERM;
 
-       ret = get_futex_key(uaddr, fshared, &key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
        if (unlikely(ret != 0))
                goto out;
 
@@ -2093,14 +2081,14 @@ retry:
 
 out_unlock:
        spin_unlock(&hb->lock);
-       put_futex_key(fshared, &key);
+       put_futex_key(&key);
 
 out:
        return ret;
 
 pi_faulted:
        spin_unlock(&hb->lock);
-       put_futex_key(fshared, &key);
+       put_futex_key(&key);
 
        ret = fault_in_user_writeable(uaddr);
        if (!ret)
@@ -2160,7 +2148,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 /**
  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
  * @uaddr:     the futex we initially wait on (non-pi)
- * @fshared:   whether the futexes are shared (1) or not (0).  They must be
+ * @flags:     futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
  *             the same type, no requeueing from private to shared, etc.
  * @val:       the expected value of uaddr
  * @abs_time:  absolute timeout
@@ -2198,16 +2186,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  *  0 - On success
  * <0 - On error
  */
-static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                                 u32 val, ktime_t *abs_time, u32 bitset,
-                                int clockrt, u32 __user *uaddr2)
+                                u32 __user *uaddr2)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
        struct rt_mutex_waiter rt_waiter;
        struct rt_mutex *pi_mutex = NULL;
        struct futex_hash_bucket *hb;
-       union futex_key key2;
-       struct futex_q q;
+       union futex_key key2 = FUTEX_KEY_INIT;
+       struct futex_q q = futex_q_init;
        int res, ret;
 
        if (!bitset)
@@ -2215,8 +2203,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 
        if (abs_time) {
                to = &timeout;
-               hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
-                                     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+               hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+                                     CLOCK_REALTIME : CLOCK_MONOTONIC,
+                                     HRTIMER_MODE_ABS);
                hrtimer_init_sleeper(to, current);
                hrtimer_set_expires_range_ns(&to->timer, *abs_time,
                                             current->timer_slack_ns);
@@ -2229,12 +2218,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
        debug_rt_mutex_init_waiter(&rt_waiter);
        rt_waiter.task = NULL;
 
-       key2 = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr2, fshared, &key2);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
        if (unlikely(ret != 0))
                goto out;
 
-       q.pi_state = NULL;
        q.bitset = bitset;
        q.rt_waiter = &rt_waiter;
        q.requeue_pi_key = &key2;
@@ -2243,7 +2230,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
         * Prepare to wait on uaddr. On success, increments q.key (key1) ref
         * count.
         */
-       ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+       ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
        if (ret)
                goto out_key2;
 
@@ -2273,8 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
                 */
                if (q.pi_state && (q.pi_state->owner != current)) {
                        spin_lock(q.lock_ptr);
-                       ret = fixup_pi_state_owner(uaddr2, &q, current,
-                                                  fshared);
+                       ret = fixup_pi_state_owner(uaddr2, &q, current);
                        spin_unlock(q.lock_ptr);
                }
        } else {
@@ -2293,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
                 * Fixup the pi_state owner and possibly acquire the lock if we
                 * haven't already.
                 */
-               res = fixup_owner(uaddr2, fshared, &q, !ret);
+               res = fixup_owner(uaddr2, &q, !ret);
                /*
                 * If fixup_owner() returned an error, propagate that.  If it
                 * acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2324,9 +2310,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
        }
 
 out_put_keys:
-       put_futex_key(fshared, &q.key);
+       put_futex_key(&q.key);
 out_key2:
-       put_futex_key(fshared, &key2);
+       put_futex_key(&key2);
 
 out:
        if (to) {
@@ -2551,58 +2537,57 @@ void exit_robust_list(struct task_struct *curr)
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                u32 __user *uaddr2, u32 val2, u32 val3)
 {
-       int clockrt, ret = -ENOSYS;
-       int cmd = op & FUTEX_CMD_MASK;
-       int fshared = 0;
+       int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
+       unsigned int flags = 0;
 
        if (!(op & FUTEX_PRIVATE_FLAG))
-               fshared = 1;
+               flags |= FLAGS_SHARED;
 
-       clockrt = op & FUTEX_CLOCK_REALTIME;
-       if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
-               return -ENOSYS;
+       if (op & FUTEX_CLOCK_REALTIME) {
+               flags |= FLAGS_CLOCKRT;
+               if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+                       return -ENOSYS;
+       }
 
        switch (cmd) {
        case FUTEX_WAIT:
                val3 = FUTEX_BITSET_MATCH_ANY;
        case FUTEX_WAIT_BITSET:
-               ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
+               ret = futex_wait(uaddr, flags, val, timeout, val3);
                break;
        case FUTEX_WAKE:
                val3 = FUTEX_BITSET_MATCH_ANY;
        case FUTEX_WAKE_BITSET:
-               ret = futex_wake(uaddr, fshared, val, val3);
+               ret = futex_wake(uaddr, flags, val, val3);
                break;
        case FUTEX_REQUEUE:
-               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
+               ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
                break;
        case FUTEX_CMP_REQUEUE:
-               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
-                                   0);
+               ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
                break;
        case FUTEX_WAKE_OP:
-               ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
+               ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
                break;
        case FUTEX_LOCK_PI:
                if (futex_cmpxchg_enabled)
-                       ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
+                       ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
                break;