Merge branch 'kvm-updates/2.6.32' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Linus Torvalds [Tue, 15 Sep 2009 00:43:43 +0000 (17:43 -0700)]
* 'kvm-updates/2.6.32' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (202 commits)
  MAINTAINERS: update KVM entry
  KVM: correct error-handling code
  KVM: fix compile warnings on s390
  KVM: VMX: Check cpl before emulating debug register access
  KVM: fix misreporting of coalesced interrupts by kvm tracer
  KVM: x86: drop duplicate kvm_flush_remote_tlb calls
  KVM: VMX: call vmx_load_host_state() only if msr is cached
  KVM: VMX: Conditionally reload debug register 6
  KVM: Use thread debug register storage instead of kvm specific data
  KVM guest: do not batch pte updates from interrupt context
  KVM: Fix coalesced interrupt reporting in IOAPIC
  KVM guest: fix bogus wallclock physical address calculation
  KVM: VMX: Fix cr8 exiting control clobbering by EPT
  KVM: Optimize kvm_mmu_unprotect_page_virt() for tdp
  KVM: Document KVM_CAP_IRQCHIP
  KVM: Protect update_cr8_intercept() when running without an apic
  KVM: VMX: Fix EPT with WP bit change during paging
  KVM: Use kvm_{read,write}_guest_virt() to read and write segment descriptors
  KVM: x86 emulator: Add adc and sbb missing decoder flags
  KVM: Add missing #include
  ...

80 files changed:
Documentation/ioctl/ioctl-number.txt
Documentation/kernel-parameters.txt
Documentation/kvm/api.txt [new file with mode: 0644]
MAINTAINERS
arch/ia64/include/asm/kvm_host.h
arch/ia64/include/asm/kvm_para.h
arch/ia64/kvm/Kconfig
arch/ia64/kvm/kvm-ia64.c
arch/ia64/kvm/vcpu.c
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/kvm/44x.c
arch/powerpc/kvm/44x_tlb.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/e500.c
arch/powerpc/kvm/e500_emulate.c
arch/powerpc/kvm/e500_tlb.c
arch/powerpc/kvm/e500_tlb.h
arch/powerpc/kvm/emulate.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/kvm/trace.h [new file with mode: 0644]
arch/s390/include/asm/kvm.h
arch/s390/include/asm/kvm_host.h
arch/s390/include/asm/kvm_para.h
arch/s390/kvm/Kconfig
arch/s390/kvm/gaccess.h
arch/s390/kvm/intercept.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/sigp.c
arch/x86/include/asm/apicdef.h
arch/x86/include/asm/kvm.h
arch/x86/include/asm/kvm_emulate.h [moved from arch/x86/include/asm/kvm_x86_emulate.h with 100% similarity]
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/kvm_para.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/vmx.h
arch/x86/kernel/kvm.c
arch/x86/kernel/kvmclock.c
arch/x86/kvm/Kconfig
arch/x86/kvm/Makefile
arch/x86/kvm/emulate.c [moved from arch/x86/kvm/x86_emulate.c with 90% similarity]
arch/x86/kvm/i8254.c
arch/x86/kvm/i8254.h
arch/x86/kvm/i8259.c
arch/x86/kvm/irq.h
arch/x86/kvm/kvm_cache_regs.h
arch/x86/kvm/kvm_svm.h [deleted file]
arch/x86/kvm/kvm_timer.h
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/mmutrace.h [new file with mode: 0644]
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/timer.c
arch/x86/kvm/trace.h [new file with mode: 0644]
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
arch/x86/mm/highmem_32.c
include/asm-generic/Kbuild.asm
include/linux/Kbuild
include/linux/kvm.h
include/linux/kvm_host.h
include/linux/kvm_para.h
include/trace/events/kvm.h [new file with mode: 0644]
mm/hugetlb.c
virt/kvm/Kconfig [new file with mode: 0644]
virt/kvm/coalesced_mmio.c
virt/kvm/coalesced_mmio.h
virt/kvm/eventfd.c [new file with mode: 0644]
virt/kvm/ioapic.c
virt/kvm/iodev.h
virt/kvm/irq_comm.c
virt/kvm/kvm_main.c
virt/kvm/kvm_trace.c [deleted file]

index 1c058b5..aafca0a 100644 (file)
@@ -193,7 +193,7 @@ Code        Seq#    Include File            Comments
 0xAD   00      Netfilter device        in development:
                                        <mailto:rusty@rustcorp.com.au>  
 0xAE   all     linux/kvm.h             Kernel-based Virtual Machine
-                                       <mailto:kvm-devel@lists.sourceforge.net>
+                                       <mailto:kvm@vger.kernel.org>
 0xB0   all     RATIO devices           in development:
                                        <mailto:vgo@ratio.de>
 0xB1   00-1F   PPPoX                   <mailto:mostrows@styx.uwaterloo.ca>
index cb3a169..3a23864 100644 (file)
@@ -57,6 +57,7 @@ parameter is applicable:
        ISAPNP  ISA PnP code is enabled.
        ISDN    Appropriate ISDN support is enabled.
        JOY     Appropriate joystick support is enabled.
+       KVM     Kernel Virtual Machine support is enabled.
        LIBATA  Libata driver is enabled
        LP      Printer support is enabled.
        LOOP    Loopback device support is enabled.
@@ -1098,6 +1099,44 @@ and is between 256 and 4096 characters. It is defined in the file
        kstack=N        [X86] Print N words from the kernel stack
                        in oops dumps.
 
+       kvm.ignore_msrs=
+                       [KVM] Ignore guest accesses to unhandled MSRs.
+                       Default is 0 (don't ignore, but inject #GP)
+
+       kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging.
+                       Default is 1 (enabled)
+
+       kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
+                       Default is 0 (off)
+
+       kvm-amd.npt=    [KVM,AMD] Disable nested paging (virtualized MMU)
+                       for all guests.
+                       Default is 1 (enabled) if in 64bit or 32bit-PAE mode
+
+       kvm-intel.bypass_guest_pf=
+                       [KVM,Intel] Disables bypassing of guest page faults
+                       on Intel chips. Default is 1 (enabled)
+
+       kvm-intel.ept=  [KVM,Intel] Disable extended page tables
+                       (virtualized MMU) support on capable Intel chips.
+                       Default is 1 (enabled)
+
+       kvm-intel.emulate_invalid_guest_state=
+                       [KVM,Intel] Enable emulation of invalid guest states
+                       Default is 0 (disabled)
+
+       kvm-intel.flexpriority=
+                       [KVM,Intel] Disable FlexPriority feature (TPR shadow).
+                       Default is 1 (enabled)
+
+       kvm-intel.unrestricted_guest=
+                       [KVM,Intel] Disable unrestricted guest feature
+                       (virtualized real and unpaged mode) on capable
+                       Intel chips. Default is 1 (enabled)
+
+       kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification
+                       feature (tagged TLBs) on capable Intel chips.
+                       Default is 1 (enabled)
+
        l2cr=           [PPC]
 
        l3cr=           [PPC]
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
new file mode 100644 (file)
index 0000000..5a4bc8c
--- /dev/null
@@ -0,0 +1,759 @@
+The Definitive KVM (Kernel-based Virtual Machine) API Documentation
+===================================================================
+
+1. General description
+
+The kvm API is a set of ioctls that are issued to control various aspects
+of a virtual machine.  The ioctls belong to three classes:
+
+ - System ioctls: These query and set global attributes which affect the
+   whole kvm subsystem.  In addition a system ioctl is used to create
+   virtual machines.
+
+ - VM ioctls: These query and set attributes that affect an entire virtual
+   machine, for example memory layout.  In addition a VM ioctl is used to
+   create virtual cpus (vcpus).
+
+   Only run VM ioctls from the same process (address space) that was used
+   to create the VM.
+
+ - vcpu ioctls: These query and set attributes that control the operation
+   of a single virtual cpu.
+
+   Only run vcpu ioctls from the same thread that was used to create the
+   vcpu.
+
+2. File descriptors
+
+The kvm API is centered around file descriptors.  An initial
+open("/dev/kvm") obtains a handle to the kvm subsystem; this handle
+can be used to issue system ioctls.  A KVM_CREATE_VM ioctl on this
+handle will create a VM file descriptor which can be used to issue VM
+ioctls.  A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu
+and return a file descriptor pointing to it.  Finally, ioctls on a vcpu
+fd can be used to control the vcpu, including the important task of
+actually running guest code.
+
+In general, file descriptors can be migrated among processes by means
+of fork() and the SCM_RIGHTS facility of unix domain sockets.  These
+kinds of tricks are explicitly not supported by kvm.  While they will
+not cause harm to the host, their actual behavior is not guaranteed by
+the API.  The only supported use is one virtual machine per process,
+and one vcpu per thread.
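+
+As an illustration only -- not itself part of the ABI -- a minimal
+sketch of this fd hierarchy, with all error handling omitted, is:
+
+       #include <fcntl.h>
+       #include <sys/ioctl.h>
+       #include <linux/kvm.h>
+
+       int main(void)
+       {
+               int sys_fd, vm_fd, vcpu_fd;
+
+               sys_fd  = open("/dev/kvm", O_RDWR);          /* system fd */
+               vm_fd   = ioctl(sys_fd, KVM_CREATE_VM, 0);   /* VM fd */
+               vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);  /* vcpu id 0 */
+               return vcpu_fd < 0;
+       }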
+
+3. Extensions
+
+As of Linux 2.6.22, the KVM ABI has been stabilized: no
+backward-incompatible changes are allowed.  However, there is an extension
+facility that allows backward-compatible extensions to the API to be
+queried and used.
+
+The extension mechanism is not based on the Linux version number.
+Instead, kvm defines extension identifiers and a facility to query
+whether a particular extension identifier is available.  If it is, a
+set of ioctls is available for application use.
+
+4. API description
+
+This section describes ioctls that can be used to control kvm guests.
+For each ioctl, the following information is provided along with a
+description:
+
+  Capability: which KVM extension provides this ioctl.  Can be 'basic',
+      which means that it will be provided by any kernel that supports
+      API version 12 (see section 4.1), or a KVM_CAP_xyz constant, which
+      means availability needs to be checked with KVM_CHECK_EXTENSION
+      (see section 4.4).
+
+  Architectures: which instruction set architectures provide this ioctl.
+      x86 includes both i386 and x86_64.
+
+  Type: system, vm, or vcpu.
+
+  Parameters: what parameters are accepted by the ioctl.
+
+  Returns: the return value.  General error numbers (EBADF, ENOMEM, EINVAL)
+      are not detailed, but errors with specific meanings are.
+
+4.1 KVM_GET_API_VERSION
+
+Capability: basic
+Architectures: all
+Type: system ioctl
+Parameters: none
+Returns: the constant KVM_API_VERSION (=12)
+
+This identifies the API version as the stable kvm API. It is not
+expected that this number will change.  However, Linux 2.6.20 and
+2.6.21 report earlier versions; these are not documented and not
+supported.  Applications should refuse to run if KVM_GET_API_VERSION
+returns a value other than 12.  If this check passes, all ioctls
+described as 'basic' will be available.
+
+4.2 KVM_CREATE_VM
+
+Capability: basic
+Architectures: all
+Type: system ioctl
+Parameters: none
+Returns: a VM fd that can be used to control the new virtual machine.
+
+The new VM has no virtual cpus and no memory.  An mmap() of a VM fd
+will access the virtual machine's physical address space; offset zero
+corresponds to guest physical address zero.  Use of mmap() on a VM fd
+is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is
+available.
+
+4.3 KVM_GET_MSR_INDEX_LIST
+
+Capability: basic
+Architectures: x86
+Type: system
+Parameters: struct kvm_msr_list (in/out)
+Returns: 0 on success; -1 on error
+Errors:
+  E2BIG:     the msr index list is too big to fit in the array specified by
+             the user.
+
+struct kvm_msr_list {
+       __u32 nmsrs; /* number of msrs in entries */
+       __u32 indices[0];
+};
+
+This ioctl returns the guest msrs that are supported.  The list varies
+by kvm version and host processor, but does not change otherwise.  The
+user fills in the size of the indices array in nmsrs, and in return
+kvm adjusts nmsrs to reflect the actual number of msrs and fills in
+the indices array with their numbers.
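+
+As a usage sketch (not normative; assumes sys_fd is an open /dev/kvm
+handle and that <errno.h>, <stdlib.h>, <sys/ioctl.h> and <linux/kvm.h>
+are included), the usual pattern is to guess a size and retry on E2BIG,
+since kvm writes the required count back into nmsrs:
+
+       struct kvm_msr_list *list;
+       __u32 n = 64;                     /* initial guess */
+
+       for (;;) {
+               list = malloc(sizeof(*list) + n * sizeof(__u32));
+               list->nmsrs = n;
+               if (ioctl(sys_fd, KVM_GET_MSR_INDEX_LIST, list) == 0)
+                       break;            /* list->indices is now valid */
+               if (errno != E2BIG)
+                       exit(1);
+               n = list->nmsrs;          /* required size, written by kvm */
+               free(list);
+       }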
+
+4.4 KVM_CHECK_EXTENSION
+
+Capability: basic
+Architectures: all
+Type: system ioctl
+Parameters: extension identifier (KVM_CAP_*)
+Returns: 0 if unsupported; 1 (or some other positive integer) if supported
+
+The API allows the application to query about extensions to the core
+kvm API.  Userspace passes an extension identifier (an integer) and
+receives an integer that describes the extension availability.
+Generally 0 means no and 1 means yes, but some extensions may report
+additional information in the integer return value.
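+
+For example, an application that prefers the newer
+KVM_SET_USER_MEMORY_REGION interface (mentioned under
+KVM_SET_MEMORY_REGION below) might probe for it like this (sketch;
+sys_fd as in section 2, use_user_memory a hypothetical flag):
+
+       if (ioctl(sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY) > 0)
+               use_user_memory = 1;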
+
+4.5 KVM_GET_VCPU_MMAP_SIZE
+
+Capability: basic
+Architectures: all
+Type: system ioctl
+Parameters: none
+Returns: size of vcpu mmap area, in bytes
+
+The KVM_RUN ioctl (see 4.10) communicates with userspace via a shared
+memory region.  This ioctl returns the size of that region.  See the
+KVM_RUN documentation for details.
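+
+A short sketch of mapping the region (assumes <sys/mman.h> is included
+and that sys_fd and vcpu_fd are as in section 2):
+
+       int size = ioctl(sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
+       struct kvm_run *run = mmap(NULL, size, PROT_READ | PROT_WRITE,
+                                  MAP_SHARED, vcpu_fd, 0);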
+
+4.6 KVM_SET_MEMORY_REGION
+
+Capability: basic
+Architectures: all
+Type: vm ioctl
+Parameters: struct kvm_memory_region (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_memory_region {
+       __u32 slot;
+       __u32 flags;
+       __u64 guest_phys_addr;
+       __u64 memory_size; /* bytes */
+};
+
+/* for kvm_memory_region::flags */
+#define KVM_MEM_LOG_DIRTY_PAGES  1UL
+
+This ioctl allows the user to create or modify a guest physical memory
+slot.  When changing an existing slot, it may be moved in the guest
+physical memory space, or its flags may be modified.  It may not be
+resized.  Slots may not overlap.
+
+The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
+instructs kvm to keep track of writes to memory within the slot.  See
+the KVM_GET_DIRTY_LOG ioctl.
+
+It is recommended to use the KVM_SET_USER_MEMORY_REGION ioctl instead
+of this API, if available.  This newer API allows placing guest memory
+at specified locations in the host address space, yielding better
+control and easy access.
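+
+A sketch of creating a 16MB RAM slot at guest physical address 0 with
+this (legacy) ioctl; vm_fd is the VM file descriptor:
+
+       struct kvm_memory_region mem = {
+               .slot            = 0,
+               .flags           = 0,
+               .guest_phys_addr = 0,
+               .memory_size     = 16 << 20,    /* 16MB */
+       };
+
+       ioctl(vm_fd, KVM_SET_MEMORY_REGION, &mem);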
+
+4.7 KVM_CREATE_VCPU
+
+Capability: basic
+Architectures: all
+Type: vm ioctl
+Parameters: vcpu id (apic id on x86)
+Returns: vcpu fd on success, -1 on error
+
+This API adds a vcpu to a virtual machine.  The vcpu id is a small integer
+in the range [0, max_vcpus).
+
+4.8 KVM_GET_DIRTY_LOG
+
+Capability: basic
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_dirty_log (in/out)
+Returns: 0 on success, -1 on error
+
+/* for KVM_GET_DIRTY_LOG */
+struct kvm_dirty_log {
+       __u32 slot;
+       __u32 padding;
+       union {
+               void __user *dirty_bitmap; /* one bit per page */
+               __u64 padding;
+       };
+};
+
+Given a memory slot, return a bitmap containing any pages dirtied
+since the last call to this ioctl.  Bit 0 is the first page in the
+memory slot.  Ensure the entire structure is cleared to avoid padding
+issues.
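+
+A sketch of fetching the log for slot 0 (assumes <string.h> and
+<stdlib.h> are included, and that 'npages', the slot size in pages, has
+been computed by the caller):
+
+       struct kvm_dirty_log log;
+
+       memset(&log, 0, sizeof(log));     /* clears the union padding */
+       log.slot = 0;
+       log.dirty_bitmap = malloc((npages + 7) / 8);  /* one bit per page */
+       ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);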
+
+4.9 KVM_SET_MEMORY_ALIAS
+
+Capability: basic
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_memory_alias (in)
+Returns: 0 (success), -1 (error)
+
+struct kvm_memory_alias {
+       __u32 slot;  /* this has a different namespace than memory slots */
+       __u32 flags;
+       __u64 guest_phys_addr;
+       __u64 memory_size;
+       __u64 target_phys_addr;
+};
+
+Defines a guest physical address space region as an alias to another
+region.  Useful for aliased addresses, for example the VGA low memory
+window. Should not be used with userspace memory.
+
+4.10 KVM_RUN
+
+Capability: basic
+Architectures: all
+Type: vcpu ioctl
+Parameters: none
+Returns: 0 on success, -1 on error
+Errors:
+  EINTR:     an unmasked signal is pending
+
+This ioctl is used to run a guest virtual cpu.  While there are no
+explicit parameters, there is an implicit parameter block that can be
+obtained by mmap()ing the vcpu fd at offset 0, with the size given by
+KVM_GET_VCPU_MMAP_SIZE.  The parameter block is formatted as a 'struct
+kvm_run' (see below).
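+
+A sketch of the canonical vcpu loop, where 'run' is the mmap()ed
+parameter block obtained as in 4.5:
+
+       void run_vcpu(int vcpu_fd, struct kvm_run *run)
+       {
+               for (;;) {
+                       if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
+                               return;   /* e.g. EINTR, see Errors above */
+                       switch (run->exit_reason) {  /* see section 5 */
+                       case KVM_EXIT_IO:
+                       case KVM_EXIT_MMIO:
+                               /* emulate the access, then re-enter */
+                               break;
+                       default:
+                               return;
+                       }
+               }
+       }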
+
+4.11 KVM_GET_REGS
+
+Capability: basic
+Architectures: all
+Type: vcpu ioctl
+Parameters: struct kvm_regs (out)
+Returns: 0 on success, -1 on error
+
+Reads the general purpose registers from the vcpu.
+
+/* x86 */
+struct kvm_regs {
+       /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+       __u64 rax, rbx, rcx, rdx;
+       __u64 rsi, rdi, rsp, rbp;
+       __u64 r8,  r9,  r10, r11;
+       __u64 r12, r13, r14, r15;
+       __u64 rip, rflags;
+};
+
+4.12 KVM_SET_REGS
+
+Capability: basic
+Architectures: all
+Type: vcpu ioctl
+Parameters: struct kvm_regs (in)
+Returns: 0 on success, -1 on error
+
+Writes the general purpose registers into the vcpu.
+
+See KVM_GET_REGS for the data structure.
+
+4.13 KVM_GET_SREGS
+
+Capability: basic
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_sregs (out)
+Returns: 0 on success, -1 on error
+
+Reads special registers from the vcpu.
+
+/* x86 */
+struct kvm_sregs {
+       struct kvm_segment cs, ds, es, fs, gs, ss;
+       struct kvm_segment tr, ldt;
+       struct kvm_dtable gdt, idt;
+       __u64 cr0, cr2, cr3, cr4, cr8;
+       __u64 efer;
+       __u64 apic_base;
+       __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
+};
+
+interrupt_bitmap is a bitmap of pending external interrupts.  At most
+one bit may be set.  This interrupt has been acknowledged by the APIC
+but not yet injected into the cpu core.
+
+4.14 KVM_SET_SREGS
+
+Capability: basic
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_sregs (in)
+Returns: 0 on success, -1 on error
+
+Writes special registers into the vcpu.  See KVM_GET_SREGS for the
+data structures.
+
+4.15 KVM_TRANSLATE
+
+Capability: basic
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_translation (in/out)
+Returns: 0 on success, -1 on error
+
+Translates a virtual address according to the vcpu's current address
+translation mode.
+
+struct kvm_translation {
+       /* in */
+       __u64 linear_address;
+
+       /* out */
+       __u64 physical_address;
+       __u8  valid;
+       __u8  writeable;
+       __u8  usermode;
+       __u8  pad[5];
+};
+
+4.16 KVM_INTERRUPT
+
+Capability: basic
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_interrupt (in)
+Returns: 0 on success, -1 on error
+
+Queues a hardware interrupt vector to be injected.  This is only
+useful if in-kernel local APIC is not used.
+
+/* for KVM_INTERRUPT */
+struct kvm_interrupt {
+       /* in */
+       __u32 irq;
+};
+
+Note 'irq' is an interrupt vector, not an interrupt pin or line.
+
+4.17 KVM_DEBUG_GUEST
+
+Capability: basic
+Architectures: none
+Type: vcpu ioctl
+Parameters: none
+Returns: -1 on error
+
+Support for this has been removed.  Use KVM_SET_GUEST_DEBUG instead.
+
+4.18 KVM_GET_MSRS
+
+Capability: basic
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_msrs (in/out)
+Returns: 0 on success, -1 on error
+
+Reads model-specific registers from the vcpu.  Supported msr indices can
+be obtained using KVM_GET_MSR_INDEX_LIST.
+
+struct kvm_msrs {
+       __u32 nmsrs; /* number of msrs in entries */
+       __u32 pad;
+
+       struct kvm_msr_entry entries[0];
+};
+
+struct kvm_msr_entry {
+       __u32 index;
+       __u32 reserved;
+       __u64 data;
+};
+
+Application code should set the 'nmsrs' member (which indicates the
+size of the entries array) and the 'index' member of each array entry.
+kvm will fill in the 'data' member.
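+
+A sketch of reading a single msr; 'msr_index' stands for a caller-chosen
+index taken from KVM_GET_MSR_INDEX_LIST:
+
+       struct {
+               struct kvm_msrs hdr;
+               struct kvm_msr_entry entry;
+       } msrs = {
+               .hdr.nmsrs   = 1,
+               .entry.index = msr_index,
+       };
+
+       if (ioctl(vcpu_fd, KVM_GET_MSRS, &msrs) >= 0)
+               value = msrs.entry.data;  /* 'value' is illustrative */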
+
+4.19 KVM_SET_MSRS
+
+Capability: basic
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_msrs (in)
+Returns: 0 on success, -1 on error
+
+Writes model-specific registers to the vcpu.  See KVM_GET_MSRS for the
+data structures.
+
+Application code should set the 'nmsrs' member (which indicates the
+size of the entries array), and the 'index' and 'data' members of each
+array entry.
+
+4.20 KVM_SET_CPUID
+
+Capability: basic
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_cpuid (in)
+Returns: 0 on success, -1 on error
+
+Defines the vcpu responses to the cpuid instruction.  Applications
+should use the KVM_SET_CPUID2 ioctl if available.
+
+struct kvm_cpuid_entry {
+       __u32 function;
+       __u32 eax;
+       __u32 ebx;
+       __u32 ecx;
+       __u32 edx;
+       __u32 padding;
+};
+
+/* for KVM_SET_CPUID */
+struct kvm_cpuid {
+       __u32 nent;
+       __u32 padding;
+       struct kvm_cpuid_entry entries[0];
+};
+
+4.21 KVM_SET_SIGNAL_MASK
+
+Capability: basic
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_signal_mask (in)
+Returns: 0 on success, -1 on error
+
+Defines which signals are blocked during execution of KVM_RUN.  This
+signal mask temporarily overrides the thread's signal mask.  Any
+unblocked signal received (except SIGKILL and SIGSTOP, which retain
+their traditional behaviour) will cause KVM_RUN to return with -EINTR.
+
+Note the signal will only be delivered if not blocked by the original
+signal mask.
+
+/* for KVM_SET_SIGNAL_MASK */
+struct kvm_signal_mask {
+       __u32 len;
+       __u8  sigset[0];
+};
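+
+A sketch of letting only SIGUSR1 interrupt KVM_RUN.  It assumes
+<signal.h> is included, that the kernel-side sigset is 8 bytes (true on
+x86-64), and that the kernel mask occupies the first bytes of the libc
+sigset_t:
+
+       struct {
+               struct kvm_signal_mask hdr;
+               sigset_t set;             /* 'len' raw bytes read from here */
+       } mask;
+
+       sigfillset(&mask.set);
+       sigdelset(&mask.set, SIGUSR1);
+       mask.hdr.len = 8;                 /* kernel sigset size, x86-64 */
+       ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, &mask);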
+
+4.22 KVM_GET_FPU
+
+Capability: basic
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_fpu (out)
+Returns: 0 on success, -1 on error
+
+Reads the floating point state from the vcpu.
+
+/* for KVM_GET_FPU and KVM_SET_FPU */
+struct kvm_fpu {
+       __u8  fpr[8][16];
+       __u16 fcw;
+       __u16 fsw;
+       __u8  ftwx;  /* in fxsave format */
+       __u8  pad1;
+       __u16 last_opcode;
+       __u64 last_ip;
+       __u64 last_dp;
+       __u8  xmm[16][16];
+       __u32 mxcsr;
+       __u32 pad2;
+};
+
+4.23 KVM_SET_FPU
+
+Capability: basic
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_fpu (in)
+Returns: 0 on success, -1 on error
+
+Writes the floating point state to the vcpu.  See KVM_GET_FPU for the
+data structure.
+
+4.24 KVM_CREATE_IRQCHIP
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86, ia64
+Type: vm ioctl
+Parameters: none
+Returns: 0 on success, -1 on error
+
+Creates an interrupt controller model in the kernel.  On x86, creates a virtual
+ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a
+local APIC.  IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC;
+GSIs 16-23 only go to the IOAPIC.  On ia64, an IOSAPIC is created.
+
+4.25 KVM_IRQ_LINE
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86, ia64
+Type: vm ioctl
+Parameters: struct kvm_irq_level
+Returns: 0 on success, -1 on error
+
+Sets the level of a GSI input to the interrupt controller model in the kernel.
+Requires that an interrupt controller model has been previously created with
+KVM_CREATE_IRQCHIP.  Note that edge-triggered interrupts require the level
+to be set to 1 and then back to 0.
+
+struct kvm_irq_level {
+       union {
+               __u32 irq;     /* GSI */
+               __s32 status;  /* not used for KVM_IRQ_LINE */
+       };
+       __u32 level;           /* 0 or 1 */
+};
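+
+For instance, delivering one edge-triggered interrupt on GSI 10 takes a
+raise/lower pair (sketch):
+
+       struct kvm_irq_level irq = { .irq = 10, .level = 1 };
+
+       ioctl(vm_fd, KVM_IRQ_LINE, &irq);       /* raise */
+       irq.level = 0;
+       ioctl(vm_fd, KVM_IRQ_LINE, &irq);       /* lower */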
+
+4.26 KVM_GET_IRQCHIP
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86, ia64
+Type: vm ioctl
+Parameters: struct kvm_irqchip (in/out)
+Returns: 0 on success, -1 on error
+
+Reads the state of a kernel interrupt controller created with
+KVM_CREATE_IRQCHIP into a buffer provided by the caller.
+
+struct kvm_irqchip {
+       __u32 chip_id;  /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */
+       __u32 pad;
+       union {
+               char dummy[512];  /* reserving space */
+               struct kvm_pic_state pic;
+               struct kvm_ioapic_state ioapic;
+       } chip;
+};
+
+4.27 KVM_SET_IRQCHIP
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86, ia64
+Type: vm ioctl
+Parameters: struct kvm_irqchip (in)
+Returns: 0 on success, -1 on error
+
+Sets the state of a kernel interrupt controller created with
+KVM_CREATE_IRQCHIP from a buffer provided by the caller.  See
+KVM_GET_IRQCHIP for the data structure.
+
+5. The kvm_run structure
+
+Application code obtains a pointer to the kvm_run structure by
+mmap()ing a vcpu fd.  From that point, application code can control
+execution by changing fields in kvm_run prior to calling the KVM_RUN
+ioctl, and obtain information about the reason KVM_RUN returned by
+looking up structure members.
+
+struct kvm_run {
+       /* in */
+       __u8 request_interrupt_window;
+
+Request that KVM_RUN return when it becomes possible to inject external
+interrupts into the guest.  Useful in conjunction with KVM_INTERRUPT.
+
+       __u8 padding1[7];
+
+       /* out */
+       __u32 exit_reason;
+
+When KVM_RUN has returned successfully (return value 0), this informs
+application code why KVM_RUN has returned.  Allowable values for this
+field are detailed below.
+
+       __u8 ready_for_interrupt_injection;
+
+If request_interrupt_window has been specified, this field indicates
+an interrupt can be injected now with KVM_INTERRUPT.
+
+       __u8 if_flag;
+
+The value of the current interrupt flag.  Only valid if in-kernel
+local APIC is not used.
+
+       __u8 padding2[2];
+
+       /* in (pre_kvm_run), out (post_kvm_run) */
+       __u64 cr8;
+
+The value of the cr8 register.  Only valid if in-kernel local APIC is
+not used.  Both input and output.
+
+       __u64 apic_base;
+
+The value of the APIC BASE msr.  Only valid if in-kernel local
+APIC is not used.  Both input and output.
+
+       union {
+               /* KVM_EXIT_UNKNOWN */
+               struct {
+                       __u64 hardware_exit_reason;
+               } hw;
+
+If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown
+reasons.  Further architecture-specific information is available in
+hardware_exit_reason.
+
+               /* KVM_EXIT_FAIL_ENTRY */
+               struct {
+                       __u64 hardware_entry_failure_reason;
+               } fail_entry;
+
+If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due
+to unknown reasons.  Further architecture-specific information is
+available in hardware_entry_failure_reason.
+
+               /* KVM_EXIT_EXCEPTION */
+               struct {
+                       __u32 exception;
+                       __u32 error_code;
+               } ex;
+
+Unused.
+
+               /* KVM_EXIT_IO */
+               struct {
+#define KVM_EXIT_IO_IN  0
+#define KVM_EXIT_IO_OUT 1
+                       __u8 direction;
+                       __u8 size; /* bytes */
+                       __u16 port;
+                       __u32 count;
+                       __u64 data_offset; /* relative to kvm_run start */
+               } io;
+
+If exit_reason is KVM_EXIT_IO, then the vcpu has executed a port I/O
+instruction which could not be satisfied by kvm.  data_offset describes
+where the data is located (KVM_EXIT_IO_OUT) or where kvm expects
+application code to place the data for the next KVM_RUN invocation
+(KVM_EXIT_IO_IN).  The data is a packed array of 'count' items, each
+'size' bytes long; a sketch of servicing this exit appears at the end
+of this section.
+
+               struct {
+                       struct kvm_debug_exit_arch arch;
+               } debug;
+
+Unused.
+
+               /* KVM_EXIT_MMIO */
+               struct {
+                       __u64 phys_addr;
+                       __u8  data[8];
+                       __u32 len;
+                       __u8  is_write;
+               } mmio;
+
+If exit_reason is KVM_EXIT_MMIO, then the vcpu has
+executed a memory-mapped I/O instruction which could not be satisfied
+by kvm.  The 'data' member contains the written data if 'is_write' is
+true, and should be filled by application code otherwise.
+
+               /* KVM_EXIT_HYPERCALL */
+               struct {
+                       __u64 nr;
+                       __u64 args[6];
+                       __u64 ret;
+                       __u32 longmode;
+                       __u32 pad;
+               } hypercall;
+
+Unused.
+
+               /* KVM_EXIT_TPR_ACCESS */
+               struct {
+                       __u64 rip;
+                       __u32 is_write;
+                       __u32 pad;
+               } tpr_access;
+
+To be documented (KVM_TPR_ACCESS_REPORTING).
+
+               /* KVM_EXIT_S390_SIEIC */
+               struct {
+                       __u8 icptcode;
+                       __u64 mask; /* psw upper half */
+                       __u64 addr; /* psw lower half */
+                       __u16 ipa;
+                       __u32 ipb;
+               } s390_sieic;
+
+s390 specific.
+
+               /* KVM_EXIT_S390_RESET */
+#define KVM_S390_RESET_POR       1
+#define KVM_S390_RESET_CLEAR     2
+#define KVM_S390_RESET_SUBSYSTEM 4
+#define KVM_S390_RESET_CPU_INIT  8
+#define KVM_S390_RESET_IPL       16
+               __u64 s390_reset_flags;
+
+s390 specific.
+
+               /* KVM_EXIT_DCR */
+               struct {
+                       __u32 dcrn;
+                       __u32 data;
+                       __u8  is_write;
+               } dcr;
+
+powerpc specific.
+
+               /* Fix the size of the union. */
+               char padding[256];
+       };
+};
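+
+To close the section, a sketch of servicing a port-output exit using
+data_offset, as promised under KVM_EXIT_IO above.  emulate_outb() is a
+hypothetical device-model hook, and the sketch assumes io.size == 1:
+
+       void handle_io_out(struct kvm_run *run)
+       {
+               /* packed array: io.count items of io.size bytes each */
+               __u8 *data = (__u8 *)run + run->io.data_offset;
+               __u32 i;
+
+               for (i = 0; i < run->io.count; i++)
+                       emulate_outb(run->io.port, data[i]);
+       }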
index e95cb77..1516936 100644 (file)
@@ -2926,6 +2926,7 @@ F:        include/linux/sunrpc/
 
 KERNEL VIRTUAL MACHINE (KVM)
 M:     Avi Kivity <avi@redhat.com>
+M:     Marcelo Tosatti <mtosatti@redhat.com>
 L:     kvm@vger.kernel.org
 W:     http://kvm.qumranet.com
 S:     Supported
index 5f43697..d9b6325 100644 (file)
@@ -235,7 +235,8 @@ struct kvm_vm_data {
 #define KVM_REQ_PTC_G          32
 #define KVM_REQ_RESUME         33
 
-#define KVM_PAGES_PER_HPAGE    1
+#define KVM_NR_PAGE_SIZES      1
+#define KVM_PAGES_PER_HPAGE(x) 1
 
 struct kvm;
 struct kvm_vcpu;
@@ -465,7 +466,6 @@ struct kvm_arch {
        unsigned long   metaphysical_rr4;
        unsigned long   vmm_init_rr;
 
-       int             online_vcpus;
        int             is_sn2;
 
        struct kvm_ioapic *vioapic;
index 0d6d8ca..1588aee 100644 (file)
  *
  */
 
+#ifdef __KERNEL__
+
 static inline unsigned int kvm_arch_para_features(void)
 {
        return 0;
 }
 
 #endif
+
+#endif
index 64d5209..ef3e7be 100644 (file)
@@ -1,12 +1,8 @@
 #
 # KVM configuration
 #
-config HAVE_KVM
-       bool
 
-config HAVE_KVM_IRQCHIP
-       bool
-       default y
+source "virt/kvm/Kconfig"
 
 menuconfig VIRTUALIZATION
        bool "Virtualization"
@@ -28,6 +24,8 @@ config KVM
        depends on PCI
        select PREEMPT_NOTIFIERS
        select ANON_INODES
+       select HAVE_KVM_IRQCHIP
+       select KVM_APIC_ARCHITECTURE
        ---help---
          Support hosting fully virtualized guest machines using hardware
          virtualization extensions.  You will need a fairly recent
@@ -49,9 +47,6 @@ config KVM_INTEL
          Provides support for KVM on Itanium 2 processors equipped with the VT
          extensions.
 
-config KVM_TRACE
-       bool
-
 source drivers/virtio/Kconfig
 
 endif # VIRTUALIZATION
index 80c57b0..0ad09f0 100644 (file)
@@ -210,16 +210,6 @@ int kvm_dev_ioctl_check_extension(long ext)
 
 }
 
-static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
-                                       gpa_t addr, int len, int is_write)
-{
-       struct kvm_io_device *dev;
-
-       dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, is_write);
-
-       return dev;
-}
-
 static int handle_vm_error(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -231,6 +221,7 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        struct kvm_mmio_req *p;
        struct kvm_io_device *mmio_dev;
+       int r;
 
        p = kvm_get_vcpu_ioreq(vcpu);
 
@@ -247,16 +238,13 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        kvm_run->exit_reason = KVM_EXIT_MMIO;
        return 0;
 mmio:
-       mmio_dev = vcpu_find_mmio_dev(vcpu, p->addr, p->size, !p->dir);
-       if (mmio_dev) {
-               if (!p->dir)
-                       kvm_iodevice_write(mmio_dev, p->addr, p->size,
-                                               &p->data);
-               else
-                       kvm_iodevice_read(mmio_dev, p->addr, p->size,
-                                               &p->data);
-
-       } else
+       if (p->dir)
+               r = kvm_io_bus_read(&vcpu->kvm->mmio_bus, p->addr,
+                                   p->size, &p->data);
+       else
+               r = kvm_io_bus_write(&vcpu->kvm->mmio_bus, p->addr,
+                                    p->size, &p->data);
+       if (r)
                printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", p->addr);
        p->state = STATE_IORESP_READY;
 
@@ -337,13 +325,12 @@ static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id,
 {
        union ia64_lid lid;
        int i;
+       struct kvm_vcpu *vcpu;
 
-       for (i = 0; i < kvm->arch.online_vcpus; i++) {
-               if (kvm->vcpus[i]) {
-                       lid.val = VCPU_LID(kvm->vcpus[i]);
-                       if (lid.id == id && lid.eid == eid)
-                               return kvm->vcpus[i];
-               }
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               lid.val = VCPU_LID(vcpu);
+               if (lid.id == id && lid.eid == eid)
+                       return vcpu;
        }
 
        return NULL;
@@ -409,21 +396,21 @@ static int handle_global_purge(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        struct kvm *kvm = vcpu->kvm;
        struct call_data call_data;
        int i;
+       struct kvm_vcpu *vcpui;
 
        call_data.ptc_g_data = p->u.ptc_g_data;
 
-       for (i = 0; i < kvm->arch.online_vcpus; i++) {
-               if (!kvm->vcpus[i] || kvm->vcpus[i]->arch.mp_state ==
-                                               KVM_MP_STATE_UNINITIALIZED ||
-                                       vcpu == kvm->vcpus[i])
+       kvm_for_each_vcpu(i, vcpui, kvm) {
+               if (vcpui->arch.mp_state == KVM_MP_STATE_UNINITIALIZED ||
+                               vcpu == vcpui)
                        continue;
 
-               if (waitqueue_active(&kvm->vcpus[i]->wq))
-                       wake_up_interruptible(&kvm->vcpus[i]->wq);
+               if (waitqueue_active(&vcpui->wq))
+                       wake_up_interruptible(&vcpui->wq);
 
-               if (kvm->vcpus[i]->cpu != -1) {
-                       call_data.vcpu = kvm->vcpus[i];
-                       smp_call_function_single(kvm->vcpus[i]->cpu,
+               if (vcpui->cpu != -1) {
+                       call_data.vcpu = vcpui;
+                       smp_call_function_single(vcpui->cpu,
                                        vcpu_global_purge, &call_data, 1);
                } else
                        printk(KERN_WARNING"kvm: Uninit vcpu received ipi!\n");
@@ -852,8 +839,6 @@ struct  kvm *kvm_arch_create_vm(void)
 
        kvm_init_vm(kvm);
 
-       kvm->arch.online_vcpus = 0;
-
        return kvm;
 
 }
@@ -1000,10 +985,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
                        goto out;
                if (irqchip_in_kernel(kvm)) {
                        __s32 status;
-                       mutex_lock(&kvm->lock);
+                       mutex_lock(&kvm->irq_lock);
                        status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
                                    irq_event.irq, irq_event.level);
-                       mutex_unlock(&kvm->lock);
+                       mutex_unlock(&kvm->irq_lock);
                        if (ioctl == KVM_IRQ_LINE_STATUS) {
                                irq_event.status = status;
                                if (copy_to_user(argp, &irq_event,
@@ -1216,7 +1201,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        if (IS_ERR(vmm_vcpu))
                return PTR_ERR(vmm_vcpu);
 
-       if (vcpu->vcpu_id == 0) {
+       if (kvm_vcpu_is_bsp(vcpu)) {
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
                /*Set entry address for first run.*/
@@ -1224,7 +1209,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
                /*Initialize itc offset for vcpus*/
                itc_offset = 0UL - kvm_get_itc(vcpu);
-               for (i = 0; i < kvm->arch.online_vcpus; i++) {
+               for (i = 0; i < KVM_MAX_VCPUS; i++) {
                        v = (struct kvm_vcpu *)((char *)vcpu +
                                        sizeof(struct kvm_vcpu_data) * i);
                        v->arch.itc_offset = itc_offset;
@@ -1356,8 +1341,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
                goto fail;
        }
 
-       kvm->arch.online_vcpus++;
-
        return vcpu;
 fail:
        return ERR_PTR(r);
@@ -1952,19 +1935,6 @@ int kvm_highest_pending_irq(struct kvm_vcpu *vcpu)
     return find_highest_bits((int *)&vpd->irr[0]);
 }
 
-int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
-{
-       if (kvm_highest_pending_irq(vcpu) != -1)
-               return 1;
-       return 0;
-}
-
-int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
-       /* do real check here */
-       return 1;
-}
-
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.timer_fired;
@@ -1977,7 +1947,8 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
-       return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE;
+       return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) ||
+               (kvm_highest_pending_irq(vcpu) != -1);
 }
 
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
index cc406d0..dce75b7 100644 (file)
@@ -830,8 +830,8 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val)
 
        kvm = (struct kvm *)KVM_VM_BASE;
 
-       if (vcpu->vcpu_id == 0) {
-               for (i = 0; i < kvm->arch.online_vcpus; i++) {
+       if (kvm_vcpu_is_bsp(vcpu)) {
+               for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) {
                        v = (struct kvm_vcpu *)((char *)vcpu +
                                        sizeof(struct kvm_vcpu_data) * i);
                        VMX(v, itc_offset) = itc_offset;
index fddc3ed..c9c930e 100644 (file)
@@ -34,7 +34,8 @@
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 
 /* We don't currently support large pages. */
-#define KVM_PAGES_PER_HPAGE (1UL << 31)
+#define KVM_NR_PAGE_SIZES      1
+#define KVM_PAGES_PER_HPAGE(x) (1UL<<31)
 
 struct kvm;
 struct kvm_run;
@@ -153,7 +154,6 @@ struct kvm_vcpu_arch {
        u32 pid;
        u32 swap_pid;
 
-       u32 pvr;
        u32 ccr0;
        u32 ccr1;
        u32 dbcr0;
index 0cef809..f4d1b55 100644 (file)
@@ -138,7 +138,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
        kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
 }
 
-static int kvmppc_44x_init(void)
+static int __init kvmppc_44x_init(void)
 {
        int r;
 
@@ -149,7 +149,7 @@ static int kvmppc_44x_init(void)
        return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE);
 }
 
-static void kvmppc_44x_exit(void)
+static void __exit kvmppc_44x_exit(void)
 {
        kvmppc_booke_exit();
 }
index 4a16f47..ff3cb63 100644 (file)
@@ -30,6 +30,7 @@
 #include "timing.h"
 
 #include "44x_tlb.h"
+#include "trace.h"
 
 #ifndef PPC44x_TLBE_SIZE
 #define PPC44x_TLBE_SIZE       PPC44x_TLB_4K
@@ -263,7 +264,7 @@ static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x,
 
        /* XXX set tlb_44x_index to stlb_index? */
 
-       KVMTRACE_1D(STLB_INVAL, &vcpu_44x->vcpu, stlb_index, handler);
+       trace_kvm_stlb_inval(stlb_index);
 }
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
@@ -365,8 +366,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
        /* Insert shadow mapping into hardware TLB. */
        kvmppc_44x_tlbe_set_modified(vcpu_44x, victim);
        kvmppc_44x_tlbwe(victim, &stlbe);
-       KVMTRACE_5D(STLB_WRITE, vcpu, victim, stlbe.tid, stlbe.word0, stlbe.word1,
-                   stlbe.word2, handler);
+       trace_kvm_stlb_write(victim, stlbe.tid, stlbe.word0, stlbe.word1,
+                            stlbe.word2);
 }
 
 /* For a particular guest TLB entry, invalidate the corresponding host TLB
@@ -485,8 +486,8 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
                kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index);
        }
 
-       KVMTRACE_5D(GTLB_WRITE, vcpu, gtlb_index, tlbe->tid, tlbe->word0,
-                   tlbe->word1, tlbe->word2, handler);
+       trace_kvm_gtlb_write(gtlb_index, tlbe->tid, tlbe->word0, tlbe->word1,
+                            tlbe->word2);
 
        kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
        return EMULATE_DONE;
index 5a152a5..c299268 100644 (file)
@@ -2,8 +2,7 @@
 # KVM configuration
 #
 
-config HAVE_KVM_IRQCHIP
-       bool
+source "virt/kvm/Kconfig"
 
 menuconfig VIRTUALIZATION
        bool "Virtualization"
@@ -59,17 +58,6 @@ config KVM_E500
 
          If unsure, say N.
 
-config KVM_TRACE
-       bool "KVM trace support"
-       depends on KVM && MARKERS && SYSFS
-       select RELAY
-       select DEBUG_FS
-       default n
-       ---help---
-         This option allows reading a trace of kvm-related events through
-         relayfs.  Note the ABI is not considered stable and will be
-         modified in future updates.
-
 source drivers/virtio/Kconfig
 
 endif # VIRTUALIZATION
index 459c7ee..37655fe 100644 (file)
@@ -8,7 +8,9 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm
 
 common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
 
-common-objs-$(CONFIG_KVM_TRACE)  += $(addprefix ../../../virt/kvm/, kvm_trace.o)
+CFLAGS_44x_tlb.o  := -I.
+CFLAGS_e500_tlb.o := -I.
+CFLAGS_emulate.o  := -I.
 
 kvm-objs := $(common-objs-y) powerpc.o emulate.o
 obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
index 642e420..e7bf4d0 100644 (file)
@@ -520,7 +520,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
        return kvmppc_core_vcpu_translate(vcpu, tr);
 }
 
-int kvmppc_booke_init(void)
+int __init kvmppc_booke_init(void)
 {
        unsigned long ivor[16];
        unsigned long max_ivor = 0;
index d8067fd..64949ee 100644 (file)
@@ -60,9 +60,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 
        kvmppc_e500_tlb_setup(vcpu_e500);
 
-       /* Use the same core vertion as host's */
-       vcpu->arch.pvr = mfspr(SPRN_PVR);
-
        return 0;
 }
 
@@ -132,7 +129,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
        kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
 }
 
-static int kvmppc_e500_init(void)
+static int __init kvmppc_e500_init(void)
 {
        int r, i;
        unsigned long ivor[3];
@@ -160,7 +157,7 @@ static int kvmppc_e500_init(void)
        return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE);
 }
 
-static void kvmppc_e500_exit(void)
+static void __exit kvmppc_e500_exit(void)
 {
        kvmppc_booke_exit();
 }
index 3f76041..be95b8d 100644 (file)
@@ -180,6 +180,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
        case SPRN_MMUCSR0:
                vcpu->arch.gpr[rt] = 0; break;
 
+       case SPRN_MMUCFG:
+               vcpu->arch.gpr[rt] = mfspr(SPRN_MMUCFG); break;
+
        /* extra exceptions */
        case SPRN_IVOR32:
                vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];
index 0e773fc..fb1e1dc 100644 (file)
@@ -22,6 +22,7 @@
 
 #include "../mm/mmu_decl.h"
 #include "e500_tlb.h"
+#include "trace.h"
 
 #define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1)
 
@@ -224,9 +225,8 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
 
        kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
        stlbe->mas1 = 0;
-       KVMTRACE_5D(STLB_INVAL, &vcpu_e500->vcpu, index_of(tlbsel, esel),
-                       stlbe->mas1, stlbe->mas2, stlbe->mas3, stlbe->mas7,
-                       handler);
+       trace_kvm_stlb_inval(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
+                            stlbe->mas3, stlbe->mas7);
 }
 
 static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
@@ -269,7 +269,7 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
        tlbsel = (vcpu_e500->mas4 >> 28) & 0x1;
        victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0;
        pidsel = (vcpu_e500->mas4 >> 16) & 0xf;
-       tsized = (vcpu_e500->mas4 >> 8) & 0xf;
+       tsized = (vcpu_e500->mas4 >> 7) & 0x1f;
 
        vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
                | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
@@ -309,7 +309,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
        vcpu_e500->shadow_pages[tlbsel][esel] = new_page;
 
        /* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */
-       stlbe->mas1 = MAS1_TSIZE(BOOKE_PAGESZ_4K)
+       stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K)
                | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
        stlbe->mas2 = (gvaddr & MAS2_EPN)
                | e500_shadow_mas2_attrib(gtlbe->mas2,
@@ -319,9 +319,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
                                vcpu_e500->vcpu.arch.msr & MSR_PR);
        stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN;
 
-       KVMTRACE_5D(STLB_WRITE, &vcpu_e500->vcpu, index_of(tlbsel, esel),
-                       stlbe->mas1, stlbe->mas2, stlbe->mas3, stlbe->mas7,
-                       handler);
+       trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
+                            stlbe->mas3, stlbe->mas7);
 }
 
 /* XXX only map the one-one case, for now use TLB0 */
@@ -535,9 +534,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
        gtlbe->mas3 = vcpu_e500->mas3;
        gtlbe->mas7 = vcpu_e500->mas7;
 
-       KVMTRACE_5D(GTLB_WRITE, vcpu, vcpu_e500->mas0,
-                       gtlbe->mas1, gtlbe->mas2, gtlbe->mas3, gtlbe->mas7,
-                       handler);
+       trace_kvm_gtlb_write(vcpu_e500->mas0, gtlbe->mas1, gtlbe->mas2,
+                            gtlbe->mas3, gtlbe->mas7);
 
        /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
        if (tlbe_is_host_safe(vcpu, gtlbe)) {
@@ -545,7 +543,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
                case 0:
                        /* TLB0 */
                        gtlbe->mas1 &= ~MAS1_TSIZE(~0);
-                       gtlbe->mas1 |= MAS1_TSIZE(BOOKE_PAGESZ_4K);
+                       gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
 
                        stlbsel = 0;
                        sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel);
@@ -679,14 +677,14 @@ void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
 
        /* Insert large initial mapping for guest. */
        tlbe = &vcpu_e500->guest_tlb[1][0];
-       tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_256M);
+       tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
        tlbe->mas2 = 0;
        tlbe->mas3 = E500_TLB_SUPER_PERM_MASK;
        tlbe->mas7 = 0;
 
        /* 4K map for serial output. Used by kernel wrapper. */
        tlbe = &vcpu_e500->guest_tlb[1][1];
-       tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_4K);
+       tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
        tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
        tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
        tlbe->mas7 = 0;
index 45b064b..d28e301 100644 (file)
@@ -16,7 +16,7 @@
 #define __KVM_E500_TLB_H__
 
 #include <linux/kvm_host.h>
-#include <asm/mmu-fsl-booke.h>
+#include <asm/mmu-book3e.h>
 #include <asm/tlb.h>
 #include <asm/kvm_e500.h>
 
@@ -59,7 +59,7 @@ extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
 /* TLB helper functions */
 static inline unsigned int get_tlb_size(const struct tlbe *tlbe)
 {
-       return (tlbe->mas1 >> 8) & 0xf;
+       return (tlbe->mas1 >> 7) & 0x1f;
 }
 
 static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe)
@@ -70,7 +70,7 @@ static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe)
 static inline u64 get_tlb_bytes(const struct tlbe *tlbe)
 {
        unsigned int pgsize = get_tlb_size(tlbe);
-       return 1ULL << 10 << (pgsize << 1);
+       return 1ULL << 10 << pgsize;
 }
 
 static inline gva_t get_tlb_end(const struct tlbe *tlbe)
index a561d6e..7737146 100644 (file)
@@ -29,6 +29,7 @@
 #include <asm/kvm_ppc.h>
 #include <asm/disassemble.h>
 #include "timing.h"
+#include "trace.h"
 
 #define OP_TRAP 3
 
@@ -187,7 +188,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        case SPRN_SRR1:
                                vcpu->arch.gpr[rt] = vcpu->arch.srr1; break;
                        case SPRN_PVR:
-                               vcpu->arch.gpr[rt] = vcpu->arch.pvr; break;
+                               vcpu->arch.gpr[rt] = mfspr(SPRN_PVR); break;
+                       case SPRN_PIR:
+                               vcpu->arch.gpr[rt] = mfspr(SPRN_PIR); break;
 
                        /* Note: mftb and TBRL/TBWL are user-accessible, so
                         * the guest can always access the real TB anyways.
@@ -417,7 +420,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                }
        }
 
-       KVMTRACE_3D(PPC_INSTR, vcpu, inst, (int)vcpu->arch.pc, emulated, entryexit);
+       trace_kvm_ppc_instr(inst, vcpu->arch.pc, emulated);
 
        if (advance)
                vcpu->arch.pc += 4; /* Advance past emulated instruction. */
index 2cf915e..2a4551f 100644 (file)
 #include "timing.h"
 #include "../mm/mmu_decl.h"
 
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 {
        return gfn;
 }
 
-int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
-{
-       return !!(v->arch.pending_exceptions);
-}
-
-int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
-       /* do real check here */
-       return 1;
-}
-
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-       return !(v->arch.msr & MSR_WE);
+       return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions);
 }
 
 
@@ -122,13 +114,17 @@ struct kvm *kvm_arch_create_vm(void)
 static void kvmppc_free_vcpus(struct kvm *kvm)
 {
        unsigned int i;
+       struct kvm_vcpu *vcpu;
 
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               if (kvm->vcpus[i]) {
-                       kvm_arch_vcpu_free(kvm->vcpus[i]);
-                       kvm->vcpus[i] = NULL;
-               }
-       }
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_arch_vcpu_free(vcpu);
+
+       mutex_lock(&kvm->lock);
+       for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+               kvm->vcpus[i] = NULL;
+
+       atomic_set(&kvm->online_vcpus, 0);
+       mutex_unlock(&kvm->lock);
 }
 
 void kvm_arch_sync_events(struct kvm *kvm)
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
new file mode 100644 (file)
index 0000000..67f219d
--- /dev/null
@@ -0,0 +1,104 @@
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+
+/*
+ * Tracepoint for instruction emulation.
+ */
+TRACE_EVENT(kvm_ppc_instr,
+       TP_PROTO(unsigned int inst, unsigned long pc, unsigned int emulate),
+       TP_ARGS(inst, pc, emulate),
+
+       TP_STRUCT__entry(
+               __field(        unsigned int,   inst            )
+               __field(        unsigned long,  pc              )
+               __field(        unsigned int,   emulate         )
+       ),
+
+       TP_fast_assign(
+               __entry->inst           = inst;
+               __entry->pc             = pc;
+               __entry->emulate        = emulate;
+       ),
+
+       TP_printk("inst %u pc 0x%lx emulate %u",
+                 __entry->inst, __entry->pc, __entry->emulate)
+);
+
+TRACE_EVENT(kvm_stlb_inval,
+       TP_PROTO(unsigned int stlb_index),
+       TP_ARGS(stlb_index),
+
+       TP_STRUCT__entry(
+               __field(        unsigned int,   stlb_index      )
+       ),
+
+       TP_fast_assign(
+               __entry->stlb_index     = stlb_index;
+       ),
+
+       TP_printk("stlb_index %u", __entry->stlb_index)
+);
+
+TRACE_EVENT(kvm_stlb_write,
+       TP_PROTO(unsigned int victim, unsigned int tid, unsigned int word0,
+                unsigned int word1, unsigned int word2),
+       TP_ARGS(victim, tid, word0, word1, word2),
+
+       TP_STRUCT__entry(
+               __field(        unsigned int,   victim          )
+               __field(        unsigned int,   tid             )
+               __field(        unsigned int,   word0           )
+               __field(        unsigned int,   word1           )
+               __field(        unsigned int,   word2           )
+       ),
+
+       TP_fast_assign(
+               __entry->victim         = victim;
+               __entry->tid            = tid;
+               __entry->word0          = word0;
+               __entry->word1          = word1;
+               __entry->word2          = word2;
+       ),
+
+       TP_printk("victim %u tid %u w0 %u w1 %u w2 %u",
+               __entry->victim, __entry->tid, __entry->word0,
+               __entry->word1, __entry->word2)
+);
+
+TRACE_EVENT(kvm_gtlb_write,
+       TP_PROTO(unsigned int gtlb_index, unsigned int tid, unsigned int word0,
+                unsigned int word1, unsigned int word2),
+       TP_ARGS(gtlb_index, tid, word0, word1, word2),
+
+       TP_STRUCT__entry(
+               __field(        unsigned int,   gtlb_index      )
+               __field(        unsigned int,   tid             )
+               __field(        unsigned int,   word0           )
+               __field(        unsigned int,   word1           )
+               __field(        unsigned int,   word2           )
+       ),
+
+       TP_fast_assign(
+               __entry->gtlb_index     = gtlb_index;
+               __entry->tid            = tid;
+               __entry->word0          = word0;
+               __entry->word1          = word1;
+               __entry->word2          = word2;
+       ),
+
+       TP_printk("gtlb_index %u tid %u w0 %u w1 %u w2 %u",
+               __entry->gtlb_index, __entry->tid, __entry->word0,
+               __entry->word1, __entry->word2)
+);
+
+#endif /* _TRACE_KVM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
index 0b2f829..3dfcaeb 100644 (file)
  */
 #include <linux/types.h>
 
-/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
-struct kvm_pic_state {
-       /* no PIC for s390 */
-};
-
-struct kvm_ioapic_state {
-       /* no IOAPIC for s390 */
-};
-
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
        /* general purpose regs for s390 */
index 698988f..27605b6 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * asm-s390/kvm_host.h - definition for kernel virtual machines on s390
  *
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2009
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
@@ -40,7 +40,11 @@ struct sca_block {
        struct sca_entry cpu[64];
 } __attribute__((packed));
 
-#define KVM_PAGES_PER_HPAGE 256
+#define KVM_NR_PAGE_SIZES 2
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8)
+#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
+#define KVM_HPAGE_MASK(x)      (~(KVM_HPAGE_SIZE(x) - 1))
+#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
 
 #define CPUSTAT_HOST       0x80000000
 #define CPUSTAT_WAIT       0x10000000
@@ -182,8 +186,9 @@ struct kvm_s390_interrupt_info {
 };
 
 /* for local_interrupt.action_flags */
-#define ACTION_STORE_ON_STOP 1
-#define ACTION_STOP_ON_STOP  2
+#define ACTION_STORE_ON_STOP           (1<<0)
+#define ACTION_STOP_ON_STOP            (1<<1)
+#define ACTION_RELOADVCPU_ON_STOP      (1<<2)
 
 struct kvm_s390_local_interrupt {
        spinlock_t lock;
@@ -227,8 +232,6 @@ struct kvm_vm_stat {
 };
 
 struct kvm_arch{
-       unsigned long guest_origin;
-       unsigned long guest_memsize;
        struct sca_block *sca;
        debug_info_t *dbf;
        struct kvm_s390_float_interrupt float_int;
index 2c50379..6964db2 100644 (file)
@@ -13,6 +13,8 @@
 #ifndef __S390_KVM_PARA_H
 #define __S390_KVM_PARA_H
 
+#ifdef __KERNEL__
+
 /*
  * Hypercalls for KVM on s390. The calling convention is similar to the
  * s390 ABI, so we use R2-R6 for parameters 1-5. In addition we use R1
@@ -147,4 +149,6 @@ static inline unsigned int kvm_arch_para_features(void)
        return 0;
 }
 
+#endif
+
 #endif /* __S390_KVM_PARA_H */
index 3e260b7..bf164fc 100644 (file)
@@ -1,11 +1,7 @@
 #
 # KVM configuration
 #
-config HAVE_KVM
-       bool
-
-config HAVE_KVM_IRQCHIP
-       bool
+source "virt/kvm/Kconfig"
 
 menuconfig VIRTUALIZATION
        bool "Virtualization"
@@ -38,9 +34,6 @@ config KVM
 
          If unsure, say N.
 
-config KVM_TRACE
-       bool
-
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/virtio/Kconfig
index ed60f3a..03c716a 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * gaccess.h -  access guest memory
  *
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2009
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
 #include <linux/compiler.h>
 #include <linux/kvm_host.h>
 #include <asm/uaccess.h>
+#include "kvm-s390.h"
 
 static inline void __user *__guestaddr_to_user(struct kvm_vcpu *vcpu,
                                               unsigned long guestaddr)
 {
        unsigned long prefix  = vcpu->arch.sie_block->prefix;
-       unsigned long origin  = vcpu->kvm->arch.guest_origin;
-       unsigned long memsize = vcpu->kvm->arch.guest_memsize;
+       unsigned long origin  = vcpu->arch.sie_block->gmsor;
+       unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
 
        if (guestaddr < 2 * PAGE_SIZE)
                guestaddr += prefix;
@@ -158,8 +159,8 @@ static inline int copy_to_guest(struct kvm_vcpu *vcpu, unsigned long guestdest,
                                const void *from, unsigned long n)
 {
        unsigned long prefix  = vcpu->arch.sie_block->prefix;
-       unsigned long origin  = vcpu->kvm->arch.guest_origin;
-       unsigned long memsize = vcpu->kvm->arch.guest_memsize;
+       unsigned long origin  = vcpu->arch.sie_block->gmsor;
+       unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
 
        if ((guestdest < 2 * PAGE_SIZE) && (guestdest + n > 2 * PAGE_SIZE))
                goto slowpath;
@@ -209,8 +210,8 @@ static inline int copy_from_guest(struct kvm_vcpu *vcpu, void *to,
                                  unsigned long guestsrc, unsigned long n)
 {
        unsigned long prefix  = vcpu->arch.sie_block->prefix;
-       unsigned long origin  = vcpu->kvm->arch.guest_origin;
-       unsigned long memsize = vcpu->kvm->arch.guest_memsize;
+       unsigned long origin  = vcpu->arch.sie_block->gmsor;
+       unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
 
        if ((guestsrc < 2 * PAGE_SIZE) && (guestsrc + n > 2 * PAGE_SIZE))
                goto slowpath;
@@ -244,8 +245,8 @@ static inline int copy_to_guest_absolute(struct kvm_vcpu *vcpu,
                                         unsigned long guestdest,
                                         const void *from, unsigned long n)
 {
-       unsigned long origin  = vcpu->kvm->arch.guest_origin;
-       unsigned long memsize = vcpu->kvm->arch.guest_memsize;
+       unsigned long origin  = vcpu->arch.sie_block->gmsor;
+       unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
 
        if (guestdest + n > memsize)
                return -EFAULT;
@@ -262,8 +263,8 @@ static inline int copy_from_guest_absolute(struct kvm_vcpu *vcpu, void *to,
                                           unsigned long guestsrc,
                                           unsigned long n)
 {
-       unsigned long origin  = vcpu->kvm->arch.guest_origin;
-       unsigned long memsize = vcpu->kvm->arch.guest_memsize;
+       unsigned long origin  = vcpu->arch.sie_block->gmsor;
+       unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
 
        if (guestsrc + n > memsize)
                return -EFAULT;
index 98997cc..ba9d8a7 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * intercept.c - in-kernel handling for sie intercepts
  *
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2009
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
@@ -128,7 +128,7 @@ static int handle_noop(struct kvm_vcpu *vcpu)
 
 static int handle_stop(struct kvm_vcpu *vcpu)
 {
-       int rc;
+       int rc = 0;
 
        vcpu->stat.exit_stop_request++;
        atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
@@ -141,12 +141,18 @@ static int handle_stop(struct kvm_vcpu *vcpu)
                        rc = -ENOTSUPP;
        }
 
+       if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) {
+               vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP;
+               rc = SIE_INTERCEPT_RERUNVCPU;
+               vcpu->run->exit_reason = KVM_EXIT_INTR;
+       }
+
        if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) {
                vcpu->arch.local_int.action_bits &= ~ACTION_STOP_ON_STOP;
                VCPU_EVENT(vcpu, 3, "%s", "cpu stopped");
                rc = -ENOTSUPP;
-       } else
-               rc = 0;
+       }
+
        spin_unlock_bh(&vcpu->arch.local_int.lock);
        return rc;
 }
@@ -158,9 +164,9 @@ static int handle_validity(struct kvm_vcpu *vcpu)
 
        vcpu->stat.exit_validity++;
        if ((viwhy == 0x37) && (vcpu->arch.sie_block->prefix
-               <= vcpu->kvm->arch.guest_memsize - 2*PAGE_SIZE)){
+               <= kvm_s390_vcpu_get_memsize(vcpu) - 2*PAGE_SIZE)) {
                rc = fault_in_pages_writeable((char __user *)
-                        vcpu->kvm->arch.guest_origin +
+                        vcpu->arch.sie_block->gmsor +
                         vcpu->arch.sie_block->prefix,
                         2*PAGE_SIZE);
                if (rc)
index 4d61341..2c2f983 100644 (file)
@@ -283,7 +283,7 @@ static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu)
        return 1;
 }
 
-int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
+static int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 {
        struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
        struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
@@ -320,12 +320,6 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
        return rc;
 }
 
-int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
-       /* do real check here */
-       return 1;
-}
-
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
        return 0;
index 90d9d1b..07ced89 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * s390host.c --  hosting zSeries kernel virtual machines
  *
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2009
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
@@ -10,6 +10,7 @@
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
  *               Heiko Carstens <heiko.carstens@de.ibm.com>
+ *               Christian Ehrhardt <ehrhardt@de.ibm.com>
  */
 
 #include <linux/compiler.h>
@@ -210,13 +211,17 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 static void kvm_free_vcpus(struct kvm *kvm)
 {
        unsigned int i;
+       struct kvm_vcpu *vcpu;
 
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               if (kvm->vcpus[i]) {
-                       kvm_arch_vcpu_destroy(kvm->vcpus[i]);
-                       kvm->vcpus[i] = NULL;
-               }
-       }
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_arch_vcpu_destroy(vcpu);
+
+       mutex_lock(&kvm->lock);
+       for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+               kvm->vcpus[i] = NULL;
+
+       atomic_set(&kvm->online_vcpus, 0);
+       mutex_unlock(&kvm->lock);
 }
 
 void kvm_arch_sync_events(struct kvm *kvm)
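
kvm_for_each_vcpu is the new generic iterator from linux/kvm_host.h; roughly (a sketch, not the verbatim definition), it walks the first online_vcpus slots and stops at the first NULL entry, which is why the explicit KVM_MAX_VCPUS loop and per-slot NULL checks above could go:

	#define kvm_for_each_vcpu(idx, vcpup, kvm) \
		for (idx = 0; \
		     idx < atomic_read(&kvm->online_vcpus) && \
		     ((vcpup) = kvm_get_vcpu(kvm, idx)) != NULL; \
		     idx++)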
@@ -278,16 +283,10 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
        vcpu->arch.sie_block->gbea = 1;
 }
 
-/* The current code can have up to 256 pages for virtio */
-#define VIRTIODESCSPACE (256ul * 4096ul)
-
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
        atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH);
-       vcpu->arch.sie_block->gmslm = vcpu->kvm->arch.guest_memsize +
-                                     vcpu->kvm->arch.guest_origin +
-                                     VIRTIODESCSPACE - 1ul;
-       vcpu->arch.sie_block->gmsor = vcpu->kvm->arch.guest_origin;
+       set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests);
        vcpu->arch.sie_block->ecb   = 2;
        vcpu->arch.sie_block->eca   = 0xC1002001U;
        vcpu->arch.sie_block->fac   = (int) (long) facilities;
@@ -319,8 +318,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
        BUG_ON(!kvm->arch.sca);
        if (!kvm->arch.sca->cpu[id].sda)
                kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
-       else
-               BUG_ON(!kvm->vcpus[id]); /* vcpu does already exist */
        vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
        vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
 
@@ -490,9 +487,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        vcpu_load(vcpu);
 
+rerun_vcpu:
+       if (vcpu->requests)
+               if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
+                       kvm_s390_vcpu_set_mem(vcpu);
+
        /* verify, that memory has been registered */
-       if (!vcpu->kvm->arch.guest_memsize) {
+       if (!vcpu->arch.sie_block->gmslm) {
                vcpu_put(vcpu);
+               VCPU_EVENT(vcpu, 3, "%s", "no memory registered to run vcpu");
                return -EINVAL;
        }
 
@@ -509,6 +512,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr;
                break;
        case KVM_EXIT_UNKNOWN:
+       case KVM_EXIT_INTR:
        case KVM_EXIT_S390_RESET:
                break;
        default:
@@ -522,8 +526,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                rc = kvm_handle_sie_intercept(vcpu);
        } while (!signal_pending(current) && !rc);
 
-       if (signal_pending(current) && !rc)
+       if (rc == SIE_INTERCEPT_RERUNVCPU)
+               goto rerun_vcpu;
+
+       if (signal_pending(current) && !rc) {
+               kvm_run->exit_reason = KVM_EXIT_INTR;
                rc = -EINTR;
+       }
 
        if (rc == -ENOTSUPP) {
                /* intercept cannot be handled in-kernel, prepare kvm-run */
@@ -676,6 +685,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
                                int user_alloc)
 {
        int i;
+       struct kvm_vcpu *vcpu;
 
        /* A few sanity checks. We can have exactly one memory slot which has
           to start at guest virtual zero and which has to be located at a
@@ -684,7 +694,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
           vmas. It is okay to mmap() and munmap() stuff in this slot after
           doing this call at any time */
 
-       if (mem->slot || kvm->arch.guest_memsize)
+       if (mem->slot)
                return -EINVAL;
 
        if (mem->guest_phys_addr)
@@ -699,36 +709,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
        if (!user_alloc)
                return -EINVAL;
 
-       /* lock all vcpus */
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               if (!kvm->vcpus[i])
+       /* request update of sie control block for all available vcpus */
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
                        continue;
-               if (!mutex_trylock(&kvm->vcpus[i]->mutex))
-                       goto fail_out;
-       }
-
-       kvm->arch.guest_origin = mem->userspace_addr;
-       kvm->arch.guest_memsize = mem->memory_size;
-
-       /* update sie control blocks, and unlock all vcpus */
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               if (kvm->vcpus[i]) {
-                       kvm->vcpus[i]->arch.sie_block->gmsor =
-                               kvm->arch.guest_origin;
-                       kvm->vcpus[i]->arch.sie_block->gmslm =
-                               kvm->arch.guest_memsize +
-                               kvm->arch.guest_origin +
-                               VIRTIODESCSPACE - 1ul;
-                       mutex_unlock(&kvm->vcpus[i]->mutex);
-               }
+               kvm_s390_inject_sigp_stop(vcpu, ACTION_RELOADVCPU_ON_STOP);
        }
 
        return 0;
-
-fail_out:
-       for (; i >= 0; i--)
-               mutex_unlock(&kvm->vcpus[i]->mutex);
-       return -EINVAL;
 }
 
 void kvm_arch_flush_shadow(struct kvm *kvm)
index 748fee8..ec5eee7 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * kvm_s390.h -  definition for kvm on s390
  *
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2009
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
@@ -9,6 +9,7 @@
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
+ *               Christian Ehrhardt <ehrhardt@de.ibm.com>
  */
 
 #ifndef ARCH_S390_KVM_S390_H
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 
+/* The current code can have up to 256 pages for virtio */
+#define VIRTIODESCSPACE (256ul * 4096ul)
+
 typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
 
+/* negative values are error codes, positive values for internal conditions */
+#define SIE_INTERCEPT_RERUNVCPU                (1<<0)
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu);
 
 #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\
@@ -50,6 +56,30 @@ int kvm_s390_inject_vm(struct kvm *kvm,
 int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
                struct kvm_s390_interrupt *s390int);
 int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
+int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action);
+
+static inline int kvm_s390_vcpu_get_memsize(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.sie_block->gmslm
+               - vcpu->arch.sie_block->gmsor
+               - VIRTIODESCSPACE + 1ul;
+}
+
+static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu)
+{
+       struct kvm_memory_slot *mem;
+
+       down_read(&vcpu->kvm->slots_lock);
+       mem = &vcpu->kvm->memslots[0];
+
+       vcpu->arch.sie_block->gmsor = mem->userspace_addr;
+       vcpu->arch.sie_block->gmslm =
+               mem->userspace_addr +
+               (mem->npages << PAGE_SHIFT) +
+               VIRTIODESCSPACE - 1ul;
+
+       up_read(&vcpu->kvm->slots_lock);
+}
 
 /* implemented in priv.c */
 int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
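
The two inline helpers above are inverses of each other; a quick sketch with made-up numbers (origin and npages are purely illustrative):

	unsigned long origin = 0x80000000ul;  /* hypothetical userspace_addr */
	unsigned long npages = 0x10000ul;     /* hypothetical 256 MB slot */
	unsigned long gmsor  = origin;
	unsigned long gmslm  = origin + (npages << PAGE_SHIFT)
				      + VIRTIODESCSPACE - 1ul;

	/* gmslm - gmsor - VIRTIODESCSPACE + 1 == npages << PAGE_SHIFT,
	 * so kvm_s390_vcpu_get_memsize() recovers the registered size */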
index 0ef81d6..40c8c67 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * sigp.c - handling interprocessor communication
  *
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008,2009
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License (version 2 only)
@@ -9,6 +9,7 @@
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
+ *               Christian Ehrhardt <ehrhardt@de.ibm.com>
  */
 
 #include <linux/kvm.h>
@@ -107,46 +108,57 @@ unlock:
        return rc;
 }
 
-static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store)
+static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
 {
-       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
-       struct kvm_s390_local_interrupt *li;
        struct kvm_s390_interrupt_info *inti;
-       int rc;
-
-       if (cpu_addr >= KVM_MAX_VCPUS)
-               return 3; /* not operational */
 
        inti = kzalloc(sizeof(*inti), GFP_KERNEL);
        if (!inti)
                return -ENOMEM;
-
        inti->type = KVM_S390_SIGP_STOP;
 
-       spin_lock(&fi->lock);
-       li = fi->local_int[cpu_addr];
-       if (li == NULL) {
-               rc = 3; /* not operational */
-               kfree(inti);
-               goto unlock;
-       }
        spin_lock_bh(&li->lock);
        list_add_tail(&inti->list, &li->list);
        atomic_set(&li->active, 1);
        atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
-       if (store)
-               li->action_bits |= ACTION_STORE_ON_STOP;
-       li->action_bits |= ACTION_STOP_ON_STOP;
+       li->action_bits |= action;
        if (waitqueue_active(&li->wq))
                wake_up_interruptible(&li->wq);
        spin_unlock_bh(&li->lock);
-       rc = 0; /* order accepted */
+
+       return 0; /* order accepted */
+}
+
+static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action)
+{
+       struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
+       struct kvm_s390_local_interrupt *li;
+       int rc;
+
+       if (cpu_addr >= KVM_MAX_VCPUS)
+               return 3; /* not operational */
+
+       spin_lock(&fi->lock);
+       li = fi->local_int[cpu_addr];
+       if (li == NULL) {
+               rc = 3; /* not operational */
+               goto unlock;
+       }
+
+       rc = __inject_sigp_stop(li, action);
+
 unlock:
        spin_unlock(&fi->lock);
        VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr);
        return rc;
 }
 
+int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action)
+{
+       struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
+       return __inject_sigp_stop(li, action);
+}
+
 static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
 {
        int rc;
@@ -177,9 +189,9 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
        /* make sure that the new value is valid memory */
        address = address & 0x7fffe000u;
        if ((copy_from_guest(vcpu, &tmp,
-               (u64) (address + vcpu->kvm->arch.guest_origin) , 1)) ||
+               (u64) (address + vcpu->arch.sie_block->gmsor) , 1)) ||
           (copy_from_guest(vcpu, &tmp, (u64) (address +
-                       vcpu->kvm->arch.guest_origin + PAGE_SIZE), 1))) {
+                       vcpu->arch.sie_block->gmsor + PAGE_SIZE), 1))) {
                *reg |= SIGP_STAT_INVALID_PARAMETER;
                return 1; /* invalid parameter */
        }
@@ -262,11 +274,11 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
                break;
        case SIGP_STOP:
                vcpu->stat.instruction_sigp_stop++;
-               rc = __sigp_stop(vcpu, cpu_addr, 0);
+               rc = __sigp_stop(vcpu, cpu_addr, ACTION_STOP_ON_STOP);
                break;
        case SIGP_STOP_STORE_STATUS:
                vcpu->stat.instruction_sigp_stop++;
-               rc = __sigp_stop(vcpu, cpu_addr, 1);
+               rc = __sigp_stop(vcpu, cpu_addr, ACTION_STORE_ON_STOP);
                break;
        case SIGP_SET_ARCH:
                vcpu->stat.instruction_sigp_arch++;
index 7386bfa..3b62da9 100644 (file)
@@ -15,6 +15,7 @@
 
 #define        APIC_LVR        0x30
 #define                APIC_LVR_MASK           0xFF00FF
+#define                APIC_LVR_DIRECTED_EOI   (1 << 24)
 #define                GET_APIC_VERSION(x)     ((x) & 0xFFu)
 #define                GET_APIC_MAXLVT(x)      (((x) >> 16) & 0xFFu)
 #ifdef CONFIG_X86_32
@@ -41,6 +42,7 @@
 #define                APIC_DFR_CLUSTER                0x0FFFFFFFul
 #define                APIC_DFR_FLAT                   0xFFFFFFFFul
 #define        APIC_SPIV       0xF0
+#define                APIC_SPIV_DIRECTED_EOI          (1 << 12)
 #define                APIC_SPIV_FOCUS_DISABLED        (1 << 9)
 #define                APIC_SPIV_APIC_ENABLED          (1 << 8)
 #define        APIC_ISR        0x100
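
A hedged sketch (not from this patch) of how host-side code might probe and enable directed EOI with the two new bits, using apic_read()/apic_write() as found elsewhere in arch/x86:

	if (apic_read(APIC_LVR) & APIC_LVR_DIRECTED_EOI)
		apic_write(APIC_SPIV,
			   apic_read(APIC_SPIV) | APIC_SPIV_DIRECTED_EOI);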
index 125be8b..4a5fe91 100644 (file)
@@ -17,6 +17,8 @@
 #define __KVM_HAVE_USER_NMI
 #define __KVM_HAVE_GUEST_DEBUG
 #define __KVM_HAVE_MSIX
+#define __KVM_HAVE_MCE
+#define __KVM_HAVE_PIT_STATE2
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -236,6 +238,14 @@ struct kvm_pit_state {
        struct kvm_pit_channel_state channels[3];
 };
 
+#define KVM_PIT_FLAGS_HPET_LEGACY  0x00000001
+
+struct kvm_pit_state2 {
+       struct kvm_pit_channel_state channels[3];
+       __u32 flags;
+       __u32 reserved[9];
+};
+
 struct kvm_reinject_control {
        __u8 pit_reinject;
        __u8 reserved[31];
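
Userspace reaches the new struct through the matching ioctls (KVM_GET_PIT2/KVM_SET_PIT2, gated on the KVM_CAP_PIT_STATE2 capability); a sketch, with vm_fd assumed to be an open VM file descriptor:

	struct kvm_pit_state2 ps2;

	if (ioctl(vm_fd, KVM_GET_PIT2, &ps2) == 0 &&
	    (ps2.flags & KVM_PIT_FLAGS_HPET_LEGACY))
		printf("PIT channel 0 is in HPET legacy mode\n");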
index eabdc1c..3be0004 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/types.h>
 #include <linux/mm.h>
 #include <linux/mmu_notifier.h>
+#include <linux/tracepoint.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
 #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS |   \
                                  0xFFFFFF0000000000ULL)
 
-#define KVM_GUEST_CR0_MASK                                \
-       (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
-        | X86_CR0_NW | X86_CR0_CD)
+#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST                          \
+       (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
+#define KVM_GUEST_CR0_MASK                                             \
+       (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST                                \
+       (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
 #define KVM_VM_CR0_ALWAYS_ON                                           \
-       (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
-        | X86_CR0_MP)
+       (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
 #define KVM_GUEST_CR4_MASK                                             \
        (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
 #define INVALID_PAGE (~(hpa_t)0)
 #define UNMAPPED_GVA (~(gpa_t)0)
 
-/* shadow tables are PAE even on non-PAE hosts */
-#define KVM_HPAGE_SHIFT 21
-#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
-#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
-
-#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
+/* KVM Hugepage definitions for x86 */
+#define KVM_NR_PAGE_SIZES      3
+#define KVM_HPAGE_SHIFT(x)     (PAGE_SHIFT + (((x) - 1) * 9))
+#define KVM_HPAGE_SIZE(x)      (1UL << KVM_HPAGE_SHIFT(x))
+#define KVM_HPAGE_MASK(x)      (~(KVM_HPAGE_SIZE(x) - 1))
+#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
 
 #define DE_VECTOR 0
 #define DB_VECTOR 1
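
Plugging in the three levels: KVM_HPAGE_SHIFT(1) = 12 (the 4 KB base page), KVM_HPAGE_SHIFT(2) = 12 + 9 = 21 (2 MB), and KVM_HPAGE_SHIFT(3) = 30 (1 GB), so KVM_PAGES_PER_HPAGE(2) = 2^21 / 2^12 = 512 — the value the removed single-level macros produced.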
@@ -120,6 +123,10 @@ enum kvm_reg {
        NR_VCPU_REGS
 };
 
+enum kvm_reg_ex {
+       VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+};
+
 enum {
        VCPU_SREG_ES,
        VCPU_SREG_CS,
@@ -131,7 +138,7 @@ enum {
        VCPU_SREG_LDTR,
 };
 
-#include <asm/kvm_x86_emulate.h>
+#include <asm/kvm_emulate.h>
 
 #define KVM_NR_MEM_OBJS 40
 
@@ -308,7 +315,6 @@ struct kvm_vcpu_arch {
        struct {
                gfn_t gfn;      /* presumed gfn during guest pte update */
                pfn_t pfn;      /* pfn corresponding to that gfn */
-               int largepage;
                unsigned long mmu_seq;
        } update_pte;
 
@@ -334,16 +340,6 @@ struct kvm_vcpu_arch {
                u8 nr;
        } interrupt;
 
-       struct {
-               int vm86_active;
-               u8 save_iopl;
-               struct kvm_save_segment {
-                       u16 selector;
-                       unsigned long base;
-                       u32 limit;
-                       u32 ar;
-               } tr, es, ds, fs, gs;
-       } rmode;
        int halt_request; /* real mode on Intel only */
 
        int cpuid_nent;
@@ -366,13 +362,15 @@ struct kvm_vcpu_arch {
        u32 pat;
 
        int switch_db_regs;
-       unsigned long host_db[KVM_NR_DB_REGS];
-       unsigned long host_dr6;
-       unsigned long host_dr7;
        unsigned long db[KVM_NR_DB_REGS];
        unsigned long dr6;
        unsigned long dr7;
        unsigned long eff_db[KVM_NR_DB_REGS];
+
+       u64 mcg_cap;
+       u64 mcg_status;
+       u64 mcg_ctl;
+       u64 *mce_banks;
 };
 
 struct kvm_mem_alias {
@@ -409,6 +407,7 @@ struct kvm_arch{
 
        struct page *ept_identity_pagetable;
        bool ept_identity_pagetable_done;
+       gpa_t ept_identity_map_addr;
 
        unsigned long irq_sources_bitmap;
        unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
@@ -526,6 +525,9 @@ struct kvm_x86_ops {
        int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
        int (*get_tdp_level)(void);
        u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+       bool (*gb_page_enable)(void);
+
+       const struct trace_print_flags *exit_reasons_str;
 };
 
 extern struct kvm_x86_ops *kvm_x86_ops;
@@ -618,6 +620,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
                           u32 error_code);
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
 
 int kvm_pic_set_irq(void *opaque, int irq, int level);
 
@@ -752,8 +755,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
 }
 
-#define MSR_IA32_TIME_STAMP_COUNTER            0x010
-
 #define TSS_IOPB_BASE_OFFSET 0x66
 #define TSS_BASE_SIZE 0x68
 #define TSS_IOPB_SIZE (65536 / 8)
@@ -796,5 +797,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 
 #endif /* _ASM_X86_KVM_HOST_H */
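
The new kvm_require_cpl() declaration backs the "check cpl before emulating debug register access" fix; a sketch of the calling convention it implies (hedged reading of the prototype: the helper queues the fault itself and returns false when the check fails):

	if (!kvm_require_cpl(vcpu, 0))
		return 1;   /* guest was not CPL 0; #GP already injected */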
index b8a3305..c584076 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_KVM_PARA_H
 #define _ASM_X86_KVM_PARA_H
 
+#include <linux/types.h>
+
 /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx.  It
  * should be used to determine that a VM is running under KVM.
  */
index 6be7fc2..bd55490 100644 (file)
 /* AMD-V MSRs */
 
 #define MSR_VM_CR                       0xc0010114
+#define MSR_VM_IGNNE                    0xc0010115
 #define MSR_VM_HSAVE_PA                 0xc0010117
 
 #endif /* _ASM_X86_MSR_INDEX_H */
index 11be5ad..272514c 100644 (file)
@@ -55,6 +55,7 @@
 #define SECONDARY_EXEC_ENABLE_EPT               0x00000002
 #define SECONDARY_EXEC_ENABLE_VPID              0x00000020
 #define SECONDARY_EXEC_WBINVD_EXITING          0x00000040
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST      0x00000080
 
 
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
@@ -351,9 +352,16 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR         0
 #define VMX_EPT_EXTENT_CONTEXT                 1
 #define VMX_EPT_EXTENT_GLOBAL                  2
+
+#define VMX_EPT_EXECUTE_ONLY_BIT               (1ull)
+#define VMX_EPT_PAGE_WALK_4_BIT                        (1ull << 6)
+#define VMX_EPTP_UC_BIT                                (1ull << 8)
+#define VMX_EPTP_WB_BIT                                (1ull << 14)
+#define VMX_EPT_2MB_PAGE_BIT                   (1ull << 16)
 #define VMX_EPT_EXTENT_INDIVIDUAL_BIT          (1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT             (1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT              (1ull << 26)
+
 #define VMX_EPT_DEFAULT_GAW                    3
 #define VMX_EPT_MAX_GAW                                0x4
 #define VMX_EPT_MT_EPTE_SHIFT                  3
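
These feature bits live in the IA32_VMX_EPT_VPID_CAP capability MSR; a sketch of probing for 2 MB EPT mappings, assuming MSR_IA32_VMX_EPT_VPID_CAP is defined in msr-index.h (the helper name is hypothetical):

	u64 ept_vpid_cap;

	rdmsrl(MSR_IA32_VMX_EPT_VPID_CAP, ept_vpid_cap);
	if (ept_vpid_cap & VMX_EPT_2MB_PAGE_BIT)
		enable_2mb_ept_pages();   /* hypothetical helper */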
index c664d51..63b0ec8 100644 (file)
@@ -34,7 +34,6 @@
 struct kvm_para_state {
        u8 mmu_queue[MMU_QUEUE_SIZE];
        int mmu_queue_len;
-       enum paravirt_lazy_mode mode;
 };
 
 static DEFINE_PER_CPU(struct kvm_para_state, para_state);
@@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len)
 {
        struct kvm_para_state *state = kvm_para_state();
 
-       if (state->mode != PARAVIRT_LAZY_MMU) {
+       if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
                kvm_mmu_op(buffer, len);
                return;
        }
@@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn)
 
 static void kvm_enter_lazy_mmu(void)
 {
-       struct kvm_para_state *state = kvm_para_state();
-
        paravirt_enter_lazy_mmu();
-       state->mode = paravirt_get_lazy_mode();
 }
 
 static void kvm_leave_lazy_mmu(void)
@@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void)
 
        mmu_queue_flush(state);
        paravirt_leave_lazy_mmu();
-       state->mode = paravirt_get_lazy_mode();
 }
 
 static void __init paravirt_ops_setup(void)
index 223af43..e5efcdc 100644 (file)
@@ -50,8 +50,8 @@ static unsigned long kvm_get_wallclock(void)
        struct timespec ts;
        int low, high;
 
-       low = (int)__pa(&wall_clock);
-       high = ((u64)__pa(&wall_clock) >> 32);
+       low = (int)__pa_symbol(&wall_clock);
+       high = ((u64)__pa_symbol(&wall_clock) >> 32);
        native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
 
        vcpu_time = &get_cpu_var(hv_clock);
index 8600a09..b84e571 100644 (file)
@@ -1,12 +1,8 @@
 #
 # KVM configuration
 #
-config HAVE_KVM
-       bool
 
-config HAVE_KVM_IRQCHIP
-       bool
-       default y
+source "virt/kvm/Kconfig"
 
 menuconfig VIRTUALIZATION
        bool "Virtualization"
@@ -29,6 +25,9 @@ config KVM
        select PREEMPT_NOTIFIERS
        select MMU_NOTIFIER
        select ANON_INODES
+       select HAVE_KVM_IRQCHIP
+       select HAVE_KVM_EVENTFD
+       select KVM_APIC_ARCHITECTURE
        ---help---
          Support hosting fully virtualized guest machines using hardware
          virtualization extensions.  You will need a fairly recent
@@ -63,18 +62,6 @@ config KVM_AMD
          To compile this as a module, choose M here: the module
          will be called kvm-amd.
 
-config KVM_TRACE
-       bool "KVM trace support"
-       depends on KVM && SYSFS
-       select MARKERS
-       select RELAY
-       select DEBUG_FS
-       default n
-       ---help---
-         This option allows reading a trace of kvm-related events through
-         relayfs.  Note the ABI is not considered stable and will be
-         modified in future updates.
-
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/lguest/Kconfig
index b43c4ef..0e7fe78 100644 (file)
@@ -1,22 +1,19 @@
-#
-# Makefile for Kernel-based Virtual Machine module
-#
-
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-                coalesced_mmio.o irq_comm.o)
-ifeq ($(CONFIG_KVM_TRACE),y)
-common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
-endif
-ifeq ($(CONFIG_IOMMU_API),y)
-common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
-endif
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
-kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
-       i8254.o timer.o
-obj-$(CONFIG_KVM) += kvm.o
-kvm-intel-objs = vmx.o
-obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
-kvm-amd-objs = svm.o
-obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+CFLAGS_x86.o := -I.
+CFLAGS_svm.o := -I.
+CFLAGS_vmx.o := -I.
+
+kvm-y                  += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
+                               coalesced_mmio.o irq_comm.o eventfd.o)
+kvm-$(CONFIG_IOMMU_API)        += $(addprefix ../../../virt/kvm/, iommu.o)
+
+kvm-y                  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
+                          i8254.o timer.o
+kvm-intel-y            += vmx.o
+kvm-amd-y              += svm.o
+
+obj-$(CONFIG_KVM)      += kvm.o
+obj-$(CONFIG_KVM_INTEL)        += kvm-intel.o
+obj-$(CONFIG_KVM_AMD)  += kvm-amd.o
similarity index 90%
rename from arch/x86/kvm/x86_emulate.c
rename to arch/x86/kvm/emulate.c
index 616de46..1be5cd6 100644 (file)
@@ -1,5 +1,5 @@
 /******************************************************************************
- * x86_emulate.c
+ * emulate.c
  *
  * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
  *
@@ -30,7 +30,9 @@
 #define DPRINTF(x...) do {} while (0)
 #endif
 #include <linux/module.h>
-#include <asm/kvm_x86_emulate.h>
+#include <asm/kvm_emulate.h>
+
+#include "mmu.h"               /* for is_long_mode() */
 
 /*
  * Opcode effective-address decode tables.
@@ -60,6 +62,7 @@
 #define SrcImmByte  (6<<4)     /* 8-bit sign-extended immediate operand. */
 #define SrcOne      (7<<4)     /* Implied '1' */
 #define SrcImmUByte (8<<4)      /* 8-bit unsigned immediate operand. */
+#define SrcImmU     (9<<4)      /* Immediate operand, unsigned */
 #define SrcMask     (0xf<<4)
 /* Generic ModRM decode. */
 #define ModRM       (1<<8)
@@ -97,11 +100,11 @@ static u32 opcode_table[256] = {
        /* 0x10 - 0x17 */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
+       ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
        /* 0x18 - 0x1F */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
+       ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
        /* 0x20 - 0x27 */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +198,7 @@ static u32 opcode_table[256] = {
        ByteOp | SrcImmUByte, SrcImmUByte,
        /* 0xE8 - 0xEF */
        SrcImm | Stack, SrcImm | ImplicitOps,
-       SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
+       SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps,
        SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
        SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
        /* 0xF0 - 0xF7 */
@@ -208,7 +211,7 @@ static u32 opcode_table[256] = {
 
 static u32 twobyte_table[256] = {
        /* 0x00 - 0x0F */
-       0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
+       0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
        ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
        /* 0x10 - 0x1F */
        0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
@@ -216,7 +219,9 @@ static u32 twobyte_table[256] = {
        ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x30 - 0x3F */
-       ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       ImplicitOps, 0, ImplicitOps, 0,
+       ImplicitOps, ImplicitOps, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x40 - 0x47 */
        DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
        DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -319,8 +324,11 @@ static u32 group2_table[] = {
 };
 
 /* EFLAGS bit definitions. */
+#define EFLG_VM (1<<17)
+#define EFLG_RF (1<<16)
 #define EFLG_OF (1<<11)
 #define EFLG_DF (1<<10)
+#define EFLG_IF (1<<9)
 #define EFLG_SF (1<<7)
 #define EFLG_ZF (1<<6)
 #define EFLG_AF (1<<4)
@@ -1027,6 +1035,7 @@ done_prefixes:
                c->src.type = OP_MEM;
                break;
        case SrcImm:
+       case SrcImmU:
                c->src.type = OP_IMM;
                c->src.ptr = (unsigned long *)c->eip;
                c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
@@ -1044,6 +1053,19 @@ done_prefixes:
                        c->src.val = insn_fetch(s32, 4, c->eip);
                        break;
                }
+               if ((c->d & SrcMask) == SrcImmU) {
+                       switch (c->src.bytes) {
+                       case 1:
+                               c->src.val &= 0xff;
+                               break;
+                       case 2:
+                               c->src.val &= 0xffff;
+                               break;
+                       case 4:
+                               c->src.val &= 0xffffffff;
+                               break;
+                       }
+               }
                break;
        case SrcImmByte:
        case SrcImmUByte:
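
The masking is what makes SrcImmU genuinely unsigned: insn_fetch() sign-extends, so without it a 2-byte immediate of 0xfff0 would decode as 0xfffffff0, while the mask keeps it at 0x0000fff0 — which matters for the far-jump operand (SrcImmU | Src2Imm16) newly assigned to opcode 0xEA above.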
@@ -1375,6 +1397,217 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
                ctxt->interruptibility = mask;
 }
 
+static inline void
+setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
+       struct kvm_segment *cs, struct kvm_segment *ss)
+{
+       memset(cs, 0, sizeof(struct kvm_segment));
+       kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
+       memset(ss, 0, sizeof(struct kvm_segment));
+
+       cs->l = 0;              /* will be adjusted later */
+       cs->base = 0;           /* flat segment */
+       cs->g = 1;              /* 4kb granularity */
+       cs->limit = 0xffffffff; /* 4GB limit */
+       cs->type = 0x0b;        /* Read, Execute, Accessed */
+       cs->s = 1;
+       cs->dpl = 0;            /* will be adjusted later */
+       cs->present = 1;
+       cs->db = 1;
+
+       ss->unusable = 0;
+       ss->base = 0;           /* flat segment */
+       ss->limit = 0xffffffff; /* 4GB limit */
+       ss->g = 1;              /* 4kb granularity */
+       ss->s = 1;
+       ss->type = 0x03;        /* Read/Write, Accessed */
+       ss->db = 1;             /* 32bit stack segment */
+       ss->dpl = 0;
+       ss->present = 1;
+}
+
+static int
+emulate_syscall(struct x86_emulate_ctxt *ctxt)
+{
+       struct decode_cache *c = &ctxt->decode;
+       struct kvm_segment cs, ss;
+       u64 msr_data;
+
+       /* syscall is not available in real mode */
+       if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL
+               || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE))
+               return -1;
+
+       setup_syscalls_segments(ctxt, &cs, &ss);
+
+       kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+       msr_data >>= 32;
+       cs.selector = (u16)(msr_data & 0xfffc);
+       ss.selector = (u16)(msr_data + 8);
+
+       if (is_long_mode(ctxt->vcpu)) {
+               cs.db = 0;
+               cs.l = 1;
+       }
+       kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+       kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+       c->regs[VCPU_REGS_RCX] = c->eip;
+       if (is_long_mode(ctxt->vcpu)) {
+#ifdef CONFIG_X86_64
+               c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+
+               kvm_x86_ops->get_msr(ctxt->vcpu,
+                       ctxt->mode == X86EMUL_MODE_PROT64 ?
+                       MSR_LSTAR : MSR_CSTAR, &msr_data);
+               c->eip = msr_data;
+
+               kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+               ctxt->eflags &= ~(msr_data | EFLG_RF);
+#endif
+       } else {
+               /* legacy mode */
+               kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+               c->eip = (u32)msr_data;
+
+               ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+       }
+
+       return 0;
+}
+
+static int
+emulate_sysenter(struct x86_emulate_ctxt *ctxt)
+{
+       struct decode_cache *c = &ctxt->decode;
+       struct kvm_segment cs, ss;
+       u64 msr_data;
+
+       /* inject #UD if LOCK prefix is used */
+       if (c->lock_prefix)
+               return -1;
+
+       /* inject #GP if in real mode or paging is disabled */
+       if (ctxt->mode == X86EMUL_MODE_REAL ||
+               !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
+               kvm_inject_gp(ctxt->vcpu, 0);
+               return -1;
+       }
+
+       /* XXX sysenter/sysexit have not been tested in 64-bit mode.
+        * Therefore, we inject an #UD.
+        */
+       if (ctxt->mode == X86EMUL_MODE_PROT64)
+               return -1;
+
+       setup_syscalls_segments(ctxt, &cs, &ss);
+
+       kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+       switch (ctxt->mode) {
+       case X86EMUL_MODE_PROT32:
+               if ((msr_data & 0xfffc) == 0x0) {
+                       kvm_inject_gp(ctxt->vcpu, 0);
+                       return -1;
+               }
+               break;
+       case X86EMUL_MODE_PROT64:
+               if (msr_data == 0x0) {
+                       kvm_inject_gp(ctxt->vcpu, 0);
+                       return -1;
+               }
+               break;
+       }
+
+       ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+       cs.selector = (u16)msr_data;
+       cs.selector &= ~SELECTOR_RPL_MASK;
+       ss.selector = cs.selector + 8;
+       ss.selector &= ~SELECTOR_RPL_MASK;
+       if (ctxt->mode == X86EMUL_MODE_PROT64
+               || is_long_mode(ctxt->vcpu)) {
+               cs.db = 0;
+               cs.l = 1;
+       }
+
+       kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+       kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+       kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+       c->eip = msr_data;
+
+       kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+       c->regs[VCPU_REGS_RSP] = msr_data;
+
+       return 0;
+}
+
+static int
+emulate_sysexit(struct x86_emulate_ctxt *ctxt)
+{
+       struct decode_cache *c = &ctxt->decode;
+       struct kvm_segment cs, ss;
+       u64 msr_data;
+       int usermode;
+
+       /* inject #UD if LOCK prefix is used */
+       if (c->lock_prefix)
+               return -1;
+
+       /* inject #GP if in real mode or paging is disabled */
+       if (ctxt->mode == X86EMUL_MODE_REAL
+               || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
+               kvm_inject_gp(ctxt->vcpu, 0);
+               return -1;
+       }
+
+       /* sysexit must be called from CPL 0 */
+       if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
+               kvm_inject_gp(ctxt->vcpu, 0);
+               return -1;
+       }
+
+       setup_syscalls_segments(ctxt, &cs, &ss);
+
+       if ((c->rex_prefix & 0x8) != 0x0)
+               usermode = X86EMUL_MODE_PROT64;
+       else
+               usermode = X86EMUL_MODE_PROT32;
+
+       cs.dpl = 3;
+       ss.dpl = 3;
+       kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+       switch (usermode) {
+       case X86EMUL_MODE_PROT32:
+               cs.selector = (u16)(msr_data + 16);
+               if ((msr_data & 0xfffc) == 0x0) {
+                       kvm_inject_gp(ctxt->vcpu, 0);
+                       return -1;
+               }
+               ss.selector = (u16)(msr_data + 24);
+               break;
+       case X86EMUL_MODE_PROT64:
+               cs.selector = (u16)(msr_data + 32);
+               if (msr_data == 0x0) {
+                       kvm_inject_gp(ctxt->vcpu, 0);
+                       return -1;
+               }
+               ss.selector = cs.selector + 8;
+               cs.db = 0;
+               cs.l = 1;
+               break;
+       }
+       cs.selector |= SELECTOR_RPL_MASK;
+       ss.selector |= SELECTOR_RPL_MASK;
+
+       kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
+       kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+
+       c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
+       c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
+
+       return 0;
+}
+
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
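
In emulate_syscall() the selector math follows the MSR_STAR layout: after the >> 32, bits 47:32 of STAR supply the syscall CS base (masked with 0xfffc to clear the RPL bits) and SS is that value plus 8. For example, with STAR[47:32] = 0x0010 the code loads CS = 0x0010 and SS = 0x0018; in legacy mode the low 32 bits of STAR provide the target EIP, while long mode fetches RIP from MSR_LSTAR (or MSR_CSTAR for compat mode) and clears the RFLAGS bits named in MSR_SYSCALL_MASK.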
@@ -1970,6 +2203,12 @@ twobyte_insn:
                        goto cannot_emulate;
                }
                break;
+       case 0x05:              /* syscall */
+               if (emulate_syscall(ctxt) == -1)
+                       goto cannot_emulate;
+               else
+                       goto writeback;
+               break;
        case 0x06:
                emulate_clts(ctxt->vcpu);
                c->dst.type = OP_NONE;
@@ -2036,6 +2275,18 @@ twobyte_insn:
                rc = X86EMUL_CONTINUE;
                c->dst.type = OP_NONE;
                break;
+       case 0x34:              /* sysenter */
+               if (emulate_sysenter(ctxt) == -1)
+                       goto cannot_emulate;
+               else
+                       goto writeback;
+               break;
+       case 0x35:              /* sysexit */
+               if (emulate_sysexit(ctxt) == -1)
+                       goto cannot_emulate;
+               else
+                       goto writeback;
+               break;
        case 0x40 ... 0x4f:     /* cmov */
                c->dst.val = c->dst.orig_val = c->src.val;
                if (!test_cc(c->b, ctxt->eflags))
index 21f68e0..82ad523 100644 (file)
@@ -231,7 +231,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
 {
        struct kvm_pit *pit = vcpu->kvm->arch.vpit;
 
-       if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
+       if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
                return atomic_read(&pit->pit_state.pit_timer.pending);
        return 0;
 }
@@ -252,7 +252,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
        struct kvm_pit *pit = vcpu->kvm->arch.vpit;
        struct hrtimer *timer;
 
-       if (vcpu->vcpu_id != 0 || !pit)
+       if (!kvm_vcpu_is_bsp(vcpu) || !pit)
                return;
 
        timer = &pit->pit_state.pit_timer.timer;
@@ -294,7 +294,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
        pt->timer.function = kvm_timer_fn;
        pt->t_ops = &kpit_ops;
        pt->kvm = ps->pit->kvm;
-       pt->vcpu_id = 0;
+       pt->vcpu = pt->kvm->bsp_vcpu;
 
        atomic_set(&pt->pending, 0);
        ps->irq_ack = 1;
@@ -332,33 +332,62 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
        case 1:
         /* FIXME: enhance mode 4 precision */
        case 4:
-               create_pit_timer(ps, val, 0);
+               if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
+                       create_pit_timer(ps, val, 0);
+               }
                break;
        case 2:
        case 3:
-               create_pit_timer(ps, val, 1);
+               if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
+                       create_pit_timer(ps, val, 1);
+               }
                break;
        default:
                destroy_pit_timer(&ps->pit_timer);
        }
 }
 
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val)
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start)
+{
+       u8 saved_mode;
+       if (hpet_legacy_start) {
+               /* save existing mode for later reenablement */
+               saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
+               kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
+               pit_load_count(kvm, channel, val);
+               kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
+       } else {
+               pit_load_count(kvm, channel, val);
+       }
+}
+
+static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
+{
+       return container_of(dev, struct kvm_pit, dev);
+}
+
+static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
 {
-       mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       pit_load_count(kvm, channel, val);
-       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       return container_of(dev, struct kvm_pit, speaker_dev);
 }
 
-static void pit_ioport_write(struct kvm_io_device *this,
-                            gpa_t addr, int len, const void *data)
+static inline int pit_in_range(gpa_t addr)
 {
-       struct kvm_pit *pit = (struct kvm_pit *)this->private;
+       return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+               (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static int pit_ioport_write(struct kvm_io_device *this,
+                           gpa_t addr, int len, const void *data)
+{
+       struct kvm_pit *pit = dev_to_pit(this);
        struct kvm_kpit_state *pit_state = &pit->pit_state;
        struct kvm *kvm = pit->kvm;
        int channel, access;
        struct kvm_kpit_channel_state *s;
        u32 val = *(u32 *) data;
+       if (!pit_in_range(addr))
+               return -EOPNOTSUPP;
 
        val  &= 0xff;
        addr &= KVM_PIT_CHANNEL_MASK;
@@ -421,16 +450,19 @@ static void pit_ioport_write(struct kvm_io_device *this,
        }
 
        mutex_unlock(&pit_state->lock);
+       return 0;
 }
 
-static void pit_ioport_read(struct kvm_io_device *this,
-                           gpa_t addr, int len, void *data)
+static int pit_ioport_read(struct kvm_io_device *this,
+                          gpa_t addr, int len, void *data)
 {
-       struct kvm_pit *pit = (struct kvm_pit *)this->private;
+       struct kvm_pit *pit = dev_to_pit(this);
        struct kvm_kpit_state *pit_state = &pit->pit_state;
        struct kvm *kvm = pit->kvm;
        int ret, count;
        struct kvm_kpit_channel_state *s;
+       if (!pit_in_range(addr))
+               return -EOPNOTSUPP;
 
        addr &= KVM_PIT_CHANNEL_MASK;
        s = &pit_state->channels[addr];
@@ -485,37 +517,36 @@ static void pit_ioport_read(struct kvm_io_device *this,
        memcpy(data, (char *)&ret, len);
 
        mutex_unlock(&pit_state->lock);
+       return 0;
 }
 
-static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
-                       int len, int is_write)
-{
-       return ((addr >= KVM_PIT_BASE_ADDRESS) &&
-               (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
-}
-
-static void speaker_ioport_write(struct kvm_io_device *this,
-                                gpa_t addr, int len, const void *data)
+static int speaker_ioport_write(struct kvm_io_device *this,
+                               gpa_t addr, int len, const void *data)
 {
-       struct kvm_pit *pit = (struct kvm_pit *)this->private;
+       struct kvm_pit *pit = speaker_to_pit(this);
        struct kvm_kpit_state *pit_state = &pit->pit_state;
        struct kvm *kvm = pit->kvm;
        u32 val = *(u32 *) data;
+       if (addr != KVM_SPEAKER_BASE_ADDRESS)
+               return -EOPNOTSUPP;
 
        mutex_lock(&pit_state->lock);
        pit_state->speaker_data_on = (val >> 1) & 1;
        pit_set_gate(kvm, 2, val & 1);
        mutex_unlock(&pit_state->lock);
+       return 0;
 }
 
-static void speaker_ioport_read(struct kvm_io_device *this,
-                               gpa_t addr, int len, void *data)
+static int speaker_ioport_read(struct kvm_io_device *this,
+                              gpa_t addr, int len, void *data)
 {
-       struct kvm_pit *pit = (struct kvm_pit *)this->private;
+       struct kvm_pit *pit = speaker_to_pit(this);
        struct kvm_kpit_state *pit_state = &pit->pit_state;
        struct kvm *kvm = pit->kvm;
        unsigned int refresh_clock;
        int ret;
+       if (addr != KVM_SPEAKER_BASE_ADDRESS)
+               return -EOPNOTSUPP;
 
        /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
        refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
@@ -527,12 +558,7 @@ static void speaker_ioport_read(struct kvm_io_device *this,
                len = sizeof(ret);
        memcpy(data, (char *)&ret, len);
        mutex_unlock(&pit_state->lock);
-}
-
-static int speaker_in_range(struct kvm_io_device *this, gpa_t addr,
-                           int len, int is_write)
-{
-       return (addr == KVM_SPEAKER_BASE_ADDRESS);
+       return 0;
 }
 
 void kvm_pit_reset(struct kvm_pit *pit)
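
The conversion away from in_range()/private to kvm_io_device_ops (see the pit_dev_ops/speaker_dev_ops tables below) changes the bus protocol: each handler now claims or declines an address itself, returning -EOPNOTSUPP so the bus can try the next device. A minimal sketch of the pattern for a hypothetical device (all mydev_* names are illustrative):

	static int mydev_read(struct kvm_io_device *this,
			      gpa_t addr, int len, void *data)
	{
		struct mydev *dev = container_of(this, struct mydev, iodev);

		if (!mydev_in_range(addr))
			return -EOPNOTSUPP;  /* let the bus try other devices */
		/* ... fill *data from dev ... */
		return 0;
	}

	static const struct kvm_io_device_ops mydev_ops = {
		.read  = mydev_read,
		.write = mydev_write,
	};

	/* at setup: */
	kvm_iodevice_init(&dev->iodev, &mydev_ops);
	__kvm_io_bus_register_dev(&kvm->pio_bus, &dev->iodev);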
@@ -541,6 +567,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
        struct kvm_kpit_channel_state *c;
 
        mutex_lock(&pit->pit_state.lock);
+       pit->pit_state.flags = 0;
        for (i = 0; i < 3; i++) {
                c = &pit->pit_state.channels[i];
                c->mode = 0xff;
@@ -563,10 +590,22 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
        }
 }
 
-struct kvm_pit *kvm_create_pit(struct kvm *kvm)
+static const struct kvm_io_device_ops pit_dev_ops = {
+       .read     = pit_ioport_read,
+       .write    = pit_ioport_write,
+};
+
+static const struct kvm_io_device_ops speaker_dev_ops = {
+       .read     = speaker_ioport_read,
+       .write    = speaker_ioport_write,
+};
+
+/* Caller must hold slots_lock for writing */
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 {
        struct kvm_pit *pit;
        struct kvm_kpit_state *pit_state;
+       int ret;
 
        pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
        if (!pit)
@@ -582,19 +621,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
        mutex_lock(&pit->pit_state.lock);
        spin_lock_init(&pit->pit_state.inject_lock);
 
-       /* Initialize PIO device */
-       pit->dev.read = pit_ioport_read;
-       pit->dev.write = pit_ioport_write;
-       pit->dev.in_range = pit_in_range;
-       pit->dev.private = pit;
-       kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
-
-       pit->speaker_dev.read = speaker_ioport_read;
-       pit->speaker_dev.write = speaker_ioport_write;
-       pit->speaker_dev.in_range = speaker_in_range;
-       pit->speaker_dev.private = pit;
-       kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
-
        kvm->arch.vpit = pit;
        pit->kvm = kvm;
 
@@ -613,7 +639,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
        pit->mask_notifier.func = pit_mask_notifer;
        kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
 
+       kvm_iodevice_init(&pit->dev, &pit_dev_ops);
+       ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+       if (ret < 0)
+               goto fail;
+
+       if (flags & KVM_PIT_SPEAKER_DUMMY) {
+               kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
+               ret = __kvm_io_bus_register_dev(&kvm->pio_bus,
+                                               &pit->speaker_dev);
+               if (ret < 0)
+                       goto fail_unregister;
+       }
+
        return pit;
+
+fail_unregister:
+       __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev);
+
+fail:
+       if (pit->irq_source_id >= 0)
+               kvm_free_irq_source_id(kvm, pit->irq_source_id);
+
+       kfree(pit);
+       return NULL;
 }
 
 void kvm_free_pit(struct kvm *kvm)
@@ -623,6 +672,8 @@ void kvm_free_pit(struct kvm *kvm)
        if (kvm->arch.vpit) {
                kvm_unregister_irq_mask_notifier(kvm, 0,
                                               &kvm->arch.vpit->mask_notifier);
+               kvm_unregister_irq_ack_notifier(kvm,
+                               &kvm->arch.vpit->pit_state.irq_ack_notifier);
                mutex_lock(&kvm->arch.vpit->pit_state.lock);
                timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
                hrtimer_cancel(timer);
@@ -637,10 +688,10 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
        struct kvm_vcpu *vcpu;
        int i;
 
-       mutex_lock(&kvm->lock);
+       mutex_lock(&kvm->irq_lock);
        kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
        kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
-       mutex_unlock(&kvm->lock);
+       mutex_unlock(&kvm->irq_lock);
 
        /*
         * Provides NMI watchdog support via Virtual Wire mode.
@@ -652,11 +703,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
         * VCPU0, and only if its LVT0 is in EXTINT mode.
         */
        if (kvm->arch.vapics_in_nmi_mode > 0)
-               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-                       vcpu = kvm->vcpus[i];
-                       if (vcpu)
-                               kvm_apic_nmi_wd_deliver(vcpu);
-               }
+               kvm_for_each_vcpu(i, vcpu, kvm)
+                       kvm_apic_nmi_wd_deliver(vcpu);
 }
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
@@ -665,7 +713,7 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
        struct kvm *kvm = vcpu->kvm;
        struct kvm_kpit_state *ps;
 
-       if (vcpu && pit) {
+       if (pit) {
                int inject = 0;
                ps = &pit->pit_state;
 
index bbd863f..d4c1c7f 100644 (file)
@@ -21,6 +21,7 @@ struct kvm_kpit_channel_state {
 
 struct kvm_kpit_state {
        struct kvm_kpit_channel_state channels[3];
+       u32 flags;
        struct kvm_timer pit_timer;
        bool is_periodic;
        u32    speaker_data_on;
@@ -49,8 +50,8 @@ struct kvm_pit {
 #define KVM_PIT_CHANNEL_MASK       0x3
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
-struct kvm_pit *kvm_create_pit(struct kvm *kvm);
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
 void kvm_free_pit(struct kvm *kvm);
 void kvm_pit_reset(struct kvm_pit *pit);
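
A sketch of the updated call sites these prototypes imply (flag value taken from the i8254.c hunk above; error handling illustrative):

	pit = kvm_create_pit(kvm, KVM_PIT_SPEAKER_DUMMY);
	if (!pit)
		return -ENOMEM;   /* creation or bus registration failed */

	kvm_pit_load_count(kvm, 0, val, 0);  /* plain reload, no HPET legacy start */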
 
index 1ccb50c..01f1516 100644 (file)
 #include "irq.h"
 
 #include <linux/kvm_host.h>
-
-static void pic_lock(struct kvm_pic *s)
-       __acquires(&s->lock)
-{
-       spin_lock(&s->lock);
-}
-
-static void pic_unlock(struct kvm_pic *s)
-       __releases(&s->lock)
-{
-       struct kvm *kvm = s->kvm;
-       unsigned acks = s->pending_acks;
-       bool wakeup = s->wakeup_needed;
-       struct kvm_vcpu *vcpu;
-
-       s->pending_acks = 0;
-       s->wakeup_needed = false;
-
-       spin_unlock(&s->lock);
-
-       while (acks) {
-               kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)),
-                                    __ffs(acks));
-               acks &= acks - 1;
-       }
-
-       if (wakeup) {
-               vcpu = s->kvm->vcpus[0];
-               if (vcpu)
-                       kvm_vcpu_kick(vcpu);
-       }
-}
+#include "trace.h"
 
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 {
        s->isr &= ~(1 << irq);
        s->isr_ack |= (1 << irq);
+       if (s != &s->pics_state->pics[0])
+               irq += 8;
+       kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
 }
 
 void kvm_pic_clear_isr_ack(struct kvm *kvm)
 {
        struct kvm_pic *s = pic_irqchip(kvm);
+       spin_lock(&s->lock);
        s->pics[0].isr_ack = 0xff;
        s->pics[1].isr_ack = 0xff;
+       spin_unlock(&s->lock);
 }
 
 /*
@@ -174,9 +148,9 @@ static void pic_update_irq(struct kvm_pic *s)
 
 void kvm_pic_update_irq(struct kvm_pic *s)
 {
-       pic_lock(s);
+       spin_lock(&s->lock);
        pic_update_irq(s);
-       pic_unlock(s);
+       spin_unlock(&s->lock);
 }
 
 int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -184,12 +158,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
        struct kvm_pic *s = opaque;
        int ret = -1;
 
-       pic_lock(s);
+       spin_lock(&s->lock);
        if (irq >= 0 && irq < PIC_NUM_PINS) {
                ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
                pic_update_irq(s);
+               trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
+                                     s->pics[irq >> 3].imr, ret == 0);
        }
-       pic_unlock(s);
+       spin_unlock(&s->lock);
 
        return ret;
 }
@@ -217,7 +193,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
        int irq, irq2, intno;
        struct kvm_pic *s = pic_irqchip(kvm);
 
-       pic_lock(s);
+       spin_lock(&s->lock);
        irq = pic_get_irq(&s->pics[0]);
        if (irq >= 0) {
                pic_intack(&s->pics[0], irq);
@@ -242,8 +218,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
                intno = s->pics[0].irq_base + irq;
        }
        pic_update_irq(s);
-       pic_unlock(s);
-       kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq);
+       spin_unlock(&s->lock);
 
        return intno;
 }
@@ -252,7 +227,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
 {
        int irq, irqbase, n;
        struct kvm *kvm = s->pics_state->irq_request_opaque;
-       struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
+       struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
 
        if (s == &s->pics_state->pics[0])
                irqbase = 0;
@@ -263,7 +238,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
                if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
                        if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
                                n = irq + irqbase;
-                               s->pics_state->pending_acks |= 1 << n;
+                               kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
                        }
        }
        s->last_irr = 0;
@@ -428,8 +403,7 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
        return s->elcr;
 }
 
-static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
-                          int len, int is_write)
+static int picdev_in_range(gpa_t addr)
 {
        switch (addr) {
        case 0x20:
@@ -444,18 +418,25 @@ static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
        }
 }
 
-static void picdev_write(struct kvm_io_device *this,
+static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
+{
+       return container_of(dev, struct kvm_pic, dev);
+}
+
+static int picdev_write(struct kvm_io_device *this,
                         gpa_t addr, int len, const void *val)
 {
-       struct kvm_pic *s = this->private;
+       struct kvm_pic *s = to_pic(this);
        unsigned char data = *(unsigned char *)val;
+       if (!picdev_in_range(addr))
+               return -EOPNOTSUPP;
 
        if (len != 1) {
                if (printk_ratelimit())
                        printk(KERN_ERR "PIC: non byte write\n");
-               return;
+               return 0;
        }
-       pic_lock(s);
+       spin_lock(&s->lock);
        switch (addr) {
        case 0x20:
        case 0x21:
@@ -468,21 +449,24 @@ static void picdev_write(struct kvm_io_device *this,
                elcr_ioport_write(&s->pics[addr & 1], addr, data);
                break;
        }
-       pic_unlock(s);
+       spin_unlock(&s->lock);
+       return 0;
 }
 
-static void picdev_read(struct kvm_io_device *this,
-                       gpa_t addr, int len, void *val)
+static int picdev_read(struct kvm_io_device *this,
+                      gpa_t addr, int len, void *val)
 {
-       struct kvm_pic *s = this->private;
+       struct kvm_pic *s = to_pic(this);
        unsigned char data = 0;
+       if (!picdev_in_range(addr))
+               return -EOPNOTSUPP;
 
        if (len != 1) {
                if (printk_ratelimit())
                        printk(KERN_ERR "PIC: non byte read\n");
-               return;
+               return 0;
        }
-       pic_lock(s);
+       spin_lock(&s->lock);
        switch (addr) {
        case 0x20:
        case 0x21:
@@ -496,7 +480,8 @@ static void picdev_read(struct kvm_io_device *this,
                break;
        }
        *(unsigned char *)val = data;
-       pic_unlock(s);
+       spin_unlock(&s->lock);
+       return 0;
 }
 
 /*
@@ -505,20 +490,27 @@ static void picdev_read(struct kvm_io_device *this,
 static void pic_irq_request(void *opaque, int level)
 {
        struct kvm *kvm = opaque;
-       struct kvm_vcpu *vcpu = kvm->vcpus[0];
+       struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
        struct kvm_pic *s = pic_irqchip(kvm);
        int irq = pic_get_irq(&s->pics[0]);
 
        s->output = level;
        if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
                s->pics[0].isr_ack &= ~(1 << irq);
-               s->wakeup_needed = true;
+               kvm_vcpu_kick(vcpu);
        }
 }
 
+static const struct kvm_io_device_ops picdev_ops = {
+       .read     = picdev_read,
+       .write    = picdev_write,
+};
+
 struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 {
        struct kvm_pic *s;
+       int ret;
+
        s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
        if (!s)
                return NULL;
@@ -534,10 +526,12 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
        /*
         * Initialize PIO device
         */
-       s->dev.read = picdev_read;
-       s->dev.write = picdev_write;
-       s->dev.in_range = picdev_in_range;
-       s->dev.private = s;
-       kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
+       kvm_iodevice_init(&s->dev, &picdev_ops);
+       ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev);
+       if (ret < 0) {
+               kfree(s);
+               return NULL;
+       }
+
        return s;
 }
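
kvm_create_pic() now wires the device up through a shared const ops table and recovers its kvm_pic from the embedded kvm_io_device with container_of(), instead of stashing function pointers and a private cookie in every instance. A self-contained sketch of that embed-and-recover pattern with simplified types:

	#include <assert.h>
	#include <stddef.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct io_device;

	struct io_device_ops {
		int (*read)(struct io_device *dev, unsigned long addr,
			    int len, void *val);
	};

	struct io_device {
		const struct io_device_ops *ops;	/* one table per type */
	};

	struct pic {
		unsigned char irr;
		struct io_device dev;			/* embedded, not pointed-to */
	};

	static int pic_read(struct io_device *this, unsigned long addr,
			    int len, void *val)
	{
		/* Recover the containing pic from the embedded member. */
		struct pic *s = container_of(this, struct pic, dev);

		*(unsigned char *)val = s->irr;
		return 0;
	}

	static const struct io_device_ops pic_ops = { .read = pic_read };

	int main(void)
	{
		struct pic s = { .irr = 0x5a, .dev = { .ops = &pic_ops } };
		unsigned char v;

		s.dev.ops->read(&s.dev, 0x20, 1, &v);
		assert(v == 0x5a);
		return 0;
	}

One const ops table is shared by every instance of the device type, and the mutable private field disappears entirely.
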
index 9f59318..7d6058a 100644
@@ -63,7 +63,6 @@ struct kvm_kpic_state {
 
 struct kvm_pic {
        spinlock_t lock;
-       bool wakeup_needed;
        unsigned pending_acks;
        struct kvm *kvm;
        struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
index 1ff819d..7bcc5b6 100644
@@ -29,4 +29,13 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
        kvm_register_write(vcpu, VCPU_REGS_RIP, val);
 }
 
+static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
+{
+       if (!test_bit(VCPU_EXREG_PDPTR,
+                     (unsigned long *)&vcpu->arch.regs_avail))
+               kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
+
+       return vcpu->arch.pdptrs[index];
+}
+
 #endif
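
kvm_pdptr_read() extends the lazy register cache used by the RIP accessors above: a register is pulled from hardware state only on first use after an exit, keyed by a per-register bit in regs_avail. A runnable sketch of the scheme; hw_read_reg() is a stand-in for the vendor-specific cache_reg() hook:

	#include <assert.h>

	enum reg { REG_RIP, REG_PDPTR0, NR_REGS };

	struct vcpu_cache {
		unsigned long regs_avail;	/* bit set => regs[] entry is current */
		unsigned long regs[NR_REGS];
		int slow_reads;			/* counts simulated hardware reads */
	};

	/* Stand-in for the vendor cache_reg() hook (VMCS/VMCB read). */
	static unsigned long hw_read_reg(struct vcpu_cache *c, enum reg r)
	{
		c->slow_reads++;
		return 0x1000ul + r;
	}

	static unsigned long cached_read(struct vcpu_cache *c, enum reg r)
	{
		if (!(c->regs_avail & (1ul << r))) {	/* first use since exit */
			c->regs[r] = hw_read_reg(c, r);
			c->regs_avail |= 1ul << r;
		}
		return c->regs[r];
	}

	int main(void)
	{
		struct vcpu_cache c = { 0 };

		cached_read(&c, REG_PDPTR0);
		cached_read(&c, REG_PDPTR0);	/* served from the cache */
		assert(c.slow_reads == 1);

		c.regs_avail = 0;		/* what the exit path does */
		cached_read(&c, REG_PDPTR0);
		assert(c.slow_reads == 2);
		return 0;
	}

The exit path only has to clear regs_avail; nothing is fetched until a register is actually wanted.
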
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
deleted file mode 100644
index ed66e4c..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef __KVM_SVM_H
-#define __KVM_SVM_H
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/kvm_host.h>
-#include <asm/msr.h>
-
-#include <asm/svm.h>
-
-static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
-       MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
-       MSR_FS_BASE,
-#endif
-       MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-};
-
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-
-struct kvm_vcpu;
-
-struct vcpu_svm {
-       struct kvm_vcpu vcpu;
-       struct vmcb *vmcb;
-       unsigned long vmcb_pa;
-       struct svm_cpu_data *svm_data;
-       uint64_t asid_generation;
-
-       u64 next_rip;
-
-       u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
-       u64 host_gs_base;
-       unsigned long host_cr2;
-
-       u32 *msrpm;
-       struct vmcb *hsave;
-       u64 hsave_msr;
-
-       u64 nested_vmcb;
-
-       /* These are the merged vectors */
-       u32 *nested_msrpm;
-
-       /* gpa pointers to the real vectors */
-       u64 nested_vmcb_msrpm;
-};
-
-#endif
-
index 26bd6ba..55c7524 100644
@@ -6,7 +6,7 @@ struct kvm_timer {
        bool reinject;
        struct kvm_timer_ops *t_ops;
        struct kvm *kvm;
-       int vcpu_id;
+       struct kvm_vcpu *vcpu;
 };
 
 struct kvm_timer_ops {
index ae99d83..1ae5ceb 100644
 #include <asm/current.h>
 #include <asm/apicdef.h>
 #include <asm/atomic.h>
 #include "kvm_cache_regs.h"
 #include "irq.h"
+#include "trace.h"
+#include "x86.h"
 
 #ifndef CONFIG_X86_64
 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -141,6 +144,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
        return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
 }
 
+void kvm_apic_set_version(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       struct kvm_cpuid_entry2 *feat;
+       u32 v = APIC_VERSION;
+
+       if (!irqchip_in_kernel(vcpu->kvm))
+               return;
+
+       feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
+       if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))))
+               v |= APIC_LVR_DIRECTED_EOI;
+       apic_set_reg(apic, APIC_LVR, v);
+}
+
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+       return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
 static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
        LVT_MASK | APIC_LVT_TIMER_PERIODIC,     /* LVTT */
        LVT_MASK | APIC_MODE_MASK,      /* LVTTHMR */
@@ -165,36 +188,52 @@ static int find_highest_vector(void *bitmap)
 
 static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
 {
+       apic->irr_pending = true;
        return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
 }
 
-static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+static inline int apic_search_irr(struct kvm_lapic *apic)
 {
-       apic_clear_vector(vec, apic->regs + APIC_IRR);
+       return find_highest_vector(apic->regs + APIC_IRR);
 }
 
 static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 {
        int result;
 
-       result = find_highest_vector(apic->regs + APIC_IRR);
+       if (!apic->irr_pending)
+               return -1;
+
+       result = apic_search_irr(apic);
        ASSERT(result == -1 || result >= 16);
 
        return result;
 }
 
+static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+{
+       apic->irr_pending = false;
+       apic_clear_vector(vec, apic->regs + APIC_IRR);
+       if (apic_search_irr(apic) != -1)
+               apic->irr_pending = true;
+}
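
The new irr_pending field is a one-bit hint that the 256-bit IRR might be non-empty, letting the common empty case skip the scan in apic_find_highest_irr() entirely; clearing a bit recomputes the hint. A runnable sketch over a flat bitmap (sizes and names are illustrative):

	#include <assert.h>

	#define NVECS 256
	#define BPL   (8 * (int)sizeof(unsigned long))	/* bits per long */

	struct irr {
		unsigned long bits[NVECS / (8 * sizeof(unsigned long))];
		int pending;	/* hint: 0 => bitmap is certainly empty */
	};

	static void irr_set(struct irr *irr, int vec)
	{
		irr->bits[vec / BPL] |= 1ul << (vec % BPL);
		irr->pending = 1;
	}

	static int irr_highest(const struct irr *irr)
	{
		int vec;

		if (!irr->pending)	/* fast path: skip the 256-bit scan */
			return -1;
		for (vec = NVECS - 1; vec >= 0; vec--)
			if (irr->bits[vec / BPL] & (1ul << (vec % BPL)))
				return vec;
		return -1;
	}

	static void irr_clear(struct irr *irr, int vec)
	{
		irr->bits[vec / BPL] &= ~(1ul << (vec % BPL));
		irr->pending = irr_highest(irr) != -1;	/* recompute the hint */
	}

	int main(void)
	{
		struct irr irr = { { 0 }, 0 };

		assert(irr_highest(&irr) == -1);	/* no scan performed */
		irr_set(&irr, 0x31);
		assert(irr_highest(&irr) == 0x31);
		irr_clear(&irr, 0x31);
		assert(!irr.pending);
		return 0;
	}
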
+
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
        int highest_irr;
 
+       /* This may race with the setting of IRR in __apic_accept_irq(), so
+        * the value returned may be stale; but kvm_vcpu_kick() in
+        * __apic_accept_irq() will cause an immediate vmexit and the value
+        * will be recalculated on the next vmentry.
+        */
        if (!apic)
                return 0;
        highest_irr = apic_find_highest_irr(apic);
 
        return highest_irr;
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
 
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                             int vector, int level, int trig_mode);
@@ -251,7 +290,12 @@ int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
 {
        int result = 0;
-       u8 logical_id;
+       u32 logical_id;
+
+       if (apic_x2apic_mode(apic)) {
+               logical_id = apic_get_reg(apic, APIC_LDR);
+               return logical_id & mda;
+       }
 
        logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
 
@@ -331,6 +375,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                        break;
 
                result = !apic_test_and_set_irr(vector, apic);
+               trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+                                         trig_mode, vector, !result);
                if (!result) {
                        if (trig_mode)
                                apic_debug("level trig mode repeatedly for "
@@ -425,7 +471,11 @@ static void apic_set_eoi(struct kvm_lapic *apic)
                trigger_mode = IOAPIC_LEVEL_TRIG;
        else
                trigger_mode = IOAPIC_EDGE_TRIG;
-       kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+       if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) {
+               mutex_lock(&apic->vcpu->kvm->irq_lock);
+               kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+               mutex_unlock(&apic->vcpu->kvm->irq_lock);
+       }
 }
 
 static void apic_send_ipi(struct kvm_lapic *apic)
@@ -440,7 +490,12 @@ static void apic_send_ipi(struct kvm_lapic *apic)
        irq.level = icr_low & APIC_INT_ASSERT;
        irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
        irq.shorthand = icr_low & APIC_SHORT_MASK;
-       irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
+       if (apic_x2apic_mode(apic))
+               irq.dest_id = icr_high;
+       else
+               irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
+
+       trace_kvm_apic_ipi(icr_low, irq.dest_id);
 
        apic_debug("icr_high 0x%x, icr_low 0x%x, "
                   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
@@ -449,7 +504,9 @@ static void apic_send_ipi(struct kvm_lapic *apic)
                   irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
                   irq.vector);
 
+       mutex_lock(&apic->vcpu->kvm->irq_lock);
        kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
+       mutex_unlock(&apic->vcpu->kvm->irq_lock);
 }
 
 static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -495,12 +552,16 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 {
        u32 val = 0;
 
-       KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
-
        if (offset >= LAPIC_MMIO_LENGTH)
                return 0;
 
        switch (offset) {
+       case APIC_ID:
+               if (apic_x2apic_mode(apic))
+                       val = kvm_apic_id(apic);
+               else
+                       val = kvm_apic_id(apic) << 24;
+               break;
        case APIC_ARBPRI:
                printk(KERN_WARNING "Access APIC ARBPRI register "
                       "which is for P6\n");
@@ -522,21 +583,35 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
        return val;
 }
 
-static void apic_mmio_read(struct kvm_io_device *this,
-                          gpa_t address, int len, void *data)
+static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
+{
+       return container_of(dev, struct kvm_lapic, dev);
+}
+
+static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+               void *data)
 {
-       struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
-       unsigned int offset = address - apic->base_address;
        unsigned char alignment = offset & 0xf;
        u32 result;
+       /* this bitmask has a bit cleared for each reserved register */
+       static const u64 rmask = 0x43ff01ffffffe70cULL;
 
        if ((alignment + len) > 4) {
-               printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
-                      (unsigned long)address, len);
-               return;
+               apic_debug("KVM_APIC_READ: alignment error %x %d\n",
+                          offset, len);
+               return 1;
        }
+
+       if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) {
+               apic_debug("KVM_APIC_READ: read reserved register %x\n",
+                          offset);
+               return 1;
+       }
+
        result = __apic_read(apic, offset & ~0xf);
 
+       trace_kvm_apic_read(offset, result);
+
        switch (len) {
        case 1:
        case 2:
@@ -548,6 +623,28 @@ static void apic_mmio_read(struct kvm_io_device *this,
                       "should be 1,2, or 4 instead\n", len);
                break;
        }
+       return 0;
+}
+
+static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
+{
+       return apic_hw_enabled(apic) &&
+           addr >= apic->base_address &&
+           addr < apic->base_address + LAPIC_MMIO_LENGTH;
+}
+
+static int apic_mmio_read(struct kvm_io_device *this,
+                          gpa_t address, int len, void *data)
+{
+       struct kvm_lapic *apic = to_lapic(this);
+       u32 offset = address - apic->base_address;
+
+       if (!apic_mmio_in_range(apic, address))
+               return -EOPNOTSUPP;
+
+       apic_reg_read(apic, offset, len, data);
+
+       return 0;
 }
 
 static void update_divide_count(struct kvm_lapic *apic)
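
The -EOPNOTSUPP return replaces the old in_range() callback: a device claims an access by handling it and declines by returning -EOPNOTSUPP, so the bus can simply offer the access to each registered device in turn. A runnable sketch of that dispatch protocol with a toy device on ports 0x20-0x21 (the bus loop is inferred from the handlers here, simplified types throughout):

	#include <assert.h>
	#include <errno.h>

	struct bus_dev {
		/* Return 0 if handled, -EOPNOTSUPP if addr is not ours. */
		int (*read)(struct bus_dev *dev, unsigned long addr,
			    int len, void *val);
	};

	struct bus {
		int dev_count;
		struct bus_dev *devs[8];	/* assumption: small fixed bus */
	};

	static int bus_read(struct bus *bus, unsigned long addr, int len, void *val)
	{
		int i;

		for (i = 0; i < bus->dev_count; i++)
			if (bus->devs[i]->read(bus->devs[i], addr, len, val) !=
			    -EOPNOTSUPP)
				return 0;	/* some device claimed the access */
		return -EOPNOTSUPP;		/* unhandled: exit to userspace */
	}

	/* Toy device claiming PIC-like ports 0x20-0x21. */
	static int toy_read(struct bus_dev *dev, unsigned long addr,
			    int len, void *val)
	{
		if (addr < 0x20 || addr > 0x21)
			return -EOPNOTSUPP;	/* replaces the in_range() hook */
		*(unsigned char *)val = 0xff;
		return 0;
	}

	int main(void)
	{
		struct bus_dev pic = { .read = toy_read };
		struct bus bus = { .dev_count = 1, .devs = { &pic } };
		unsigned char v;

		assert(bus_read(&bus, 0x20, 1, &v) == 0);
		assert(bus_read(&bus, 0x60, 1, &v) == -EOPNOTSUPP);
		return 0;
	}
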
@@ -573,6 +670,15 @@ static void start_apic_timer(struct kvm_lapic *apic)
 
        if (!apic->lapic_timer.period)
                return;
+       /*
+        * Do not allow the guest to program periodic timers with a small
+        * interval, since hrtimers are not throttled by the host
+        * scheduler.
+        */
+       if (apic_lvtt_period(apic)) {
+               if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
+                       apic->lapic_timer.period = NSEC_PER_MSEC/2;
+       }
 
        hrtimer_start(&apic->lapic_timer.timer,
                      ktime_add_ns(now, apic->lapic_timer.period),
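
The clamp above floors the period at NSEC_PER_MSEC/2 = 500,000 ns, bounding a guest-programmed periodic timer at 2000 interrupts per second of host time. A one-assert check of that ceiling:

	#include <assert.h>

	#define NSEC_PER_MSEC 1000000L

	int main(void)
	{
		/* 1e9 ns per second divided by the 500,000 ns floor = 2000/s. */
		assert(1000000000L / (NSEC_PER_MSEC / 2) == 2000);
		return 0;
	}
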
@@ -603,40 +709,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
                apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
 }
 
-static void apic_mmio_write(struct kvm_io_device *this,
-                           gpa_t address, int len, const void *data)
+static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 {
-       struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
-       unsigned int offset = address - apic->base_address;
-       unsigned char alignment = offset & 0xf;
-       u32 val;
-
-       /*
-        * APIC register must be aligned on 128-bits boundary.
-        * 32/64/128 bits registers must be accessed thru 32 bits.
-        * Refer SDM 8.4.1
-        */
-       if (len != 4 || alignment) {
-               /* Don't shout loud, $infamous_os would cause only noise. */
-               apic_debug("apic write: bad size=%d %lx\n",
-                          len, (long)address);
-               return;
-       }
-
-       val = *(u32 *) data;
-
-       /* too common printing */
-       if (offset != APIC_EOI)
-               apic_debug("%s: offset 0x%x with length 0x%x, and value is "
-                          "0x%x\n", __func__, offset, len, val);
-
-       offset &= 0xff0;
+       int ret = 0;
 
-       KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
+       trace_kvm_apic_write(reg, val);
 
-       switch (offset) {
+       switch (reg) {
        case APIC_ID:           /* Local APIC ID */
-               apic_set_reg(apic, APIC_ID, val);
+               if (!apic_x2apic_mode(apic))
+                       apic_set_reg(apic, APIC_ID, val);
+               else
+                       ret = 1;
                break;
 
        case APIC_TASKPRI:
@@ -649,15 +733,24 @@ static void apic_mmio_write(struct kvm_io_device *this,
                break;
 
        case APIC_LDR:
-               apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+               if (!apic_x2apic_mode(apic))
+                       apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+               else
+                       ret = 1;
                break;
 
        case APIC_DFR:
-               apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+               if (!apic_x2apic_mode(apic))
+                       apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+               else
+                       ret = 1;
                break;
 
-       case APIC_SPIV:
-               apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
+       case APIC_SPIV: {
+               u32 mask = 0x3ff;
+               if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
+                       mask |= APIC_SPIV_DIRECTED_EOI;
+               apic_set_reg(apic, APIC_SPIV, val & mask);
                if (!(val & APIC_SPIV_APIC_ENABLED)) {
                        int i;
                        u32 lvt_val;
@@ -672,7 +765,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
 
                }
                break;
-
+       }
        case APIC_ICR:
                /* No delay here, so we always clear the pending bit */
                apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
@@ -680,7 +773,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
                break;
 
        case APIC_ICR2:
-               apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
+               if (!apic_x2apic_mode(apic))
+                       val &= 0xff000000;
+               apic_set_reg(apic, APIC_ICR2, val);
                break;
 
        case APIC_LVT0:
@@ -694,8 +789,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
                if (!apic_sw_enabled(apic))
                        val |= APIC_LVT_MASKED;
 
-               val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
-               apic_set_reg(apic, offset, val);
+               val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
+               apic_set_reg(apic, reg, val);
 
                break;
 
@@ -703,7 +798,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
                hrtimer_cancel(&apic->lapic_timer.timer);
                apic_set_reg(apic, APIC_TMICT, val);
                start_apic_timer(apic);
-               return;
+               break;
 
        case APIC_TDCR:
                if (val & 4)
@@ -712,27 +807,59 @@ static void apic_mmio_write(struct kvm_io_device *this,
                update_divide_count(apic);
                break;
 
+       case APIC_ESR:
+               if (apic_x2apic_mode(apic) && val != 0) {
+                       printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val);
+                       ret = 1;
+               }
+               break;
+
+       case APIC_SELF_IPI:
+               if (apic_x2apic_mode(apic)) {
+                       apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
+               } else
+                       ret = 1;
+               break;
        default:
-               apic_debug("Local APIC Write to read-only register %x\n",
-                          offset);
+               ret = 1;
                break;
        }
-
+       if (ret)
+               apic_debug("Local APIC Write to read-only register %x\n", reg);
+       return ret;
 }
 
-static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
-                          int len, int size)
+static int apic_mmio_write(struct kvm_io_device *this,
+                           gpa_t address, int len, const void *data)
 {
-       struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
-       int ret = 0;
+       struct kvm_lapic *apic = to_lapic(this);
+       unsigned int offset = address - apic->base_address;
+       u32 val;
 
+       if (!apic_mmio_in_range(apic, address))
+               return -EOPNOTSUPP;
 
-       if (apic_hw_enabled(apic) &&
-           (addr >= apic->base_address) &&
-           (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
-               ret = 1;
+       /*
+        * APIC registers must be aligned on a 128-bit boundary, and
+        * 32/64/128-bit registers must be accessed through 32-bit
+        * reads and writes; see SDM 8.4.1.
+        */
+       if (len != 4 || (offset & 0xf)) {
+               /* Don't shout loud, $infamous_os would cause only noise. */
+               apic_debug("apic write: bad size=%d %lx\n", len, (long)address);
+               return 0;
+       }
 
-       return ret;
+       val = *(u32*)data;
+
+       /* too common printing */
+       if (offset != APIC_EOI)
+               apic_debug("%s: offset 0x%x with length 0x%x, and value is "
+                          "0x%x\n", __func__, offset, len, val);
+
+       apic_reg_write(apic, offset & 0xff0, val);
+
+       return 0;
 }
 
 void kvm_free_lapic(struct kvm_vcpu *vcpu)
@@ -763,7 +890,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
        apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
                     | (apic_get_reg(apic, APIC_TASKPRI) & 4));
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr);
 
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
 {
@@ -776,7 +902,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
 
        return (tpr & 0xf0) >> 4;
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
 
 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 {
@@ -787,10 +912,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
                vcpu->arch.apic_base = value;
                return;
        }
-       if (apic->vcpu->vcpu_id)
+
+       if (!kvm_vcpu_is_bsp(apic->vcpu))
                value &= ~MSR_IA32_APICBASE_BSP;
 
        vcpu->arch.apic_base = value;
+       if (apic_x2apic_mode(apic)) {
+               u32 id = kvm_apic_id(apic);
+               u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
+               apic_set_reg(apic, APIC_LDR, ldr);
+       }
        apic->base_address = apic->vcpu->arch.apic_base &
                             MSR_IA32_APICBASE_BASE;
 
@@ -800,12 +931,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 
 }
 
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
-{
-       return vcpu->arch.apic_base;
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
-
 void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic;
@@ -821,7 +946,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
        hrtimer_cancel(&apic->lapic_timer.timer);
 
        apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
-       apic_set_reg(apic, APIC_LVR, APIC_VERSION);
+       kvm_apic_set_version(apic->vcpu);
 
        for (i = 0; i < APIC_LVT_NUM; i++)
                apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
@@ -842,9 +967,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
                apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
                apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
        }
+       apic->irr_pending = false;
        update_divide_count(apic);
        atomic_set(&apic->lapic_timer.pending, 0);
-       if (vcpu->vcpu_id == 0)
+       if (kvm_vcpu_is_bsp(vcpu))
                vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
        apic_update_ppr(apic);
 
@@ -855,7 +981,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
                   vcpu, kvm_apic_id(apic),
                   vcpu->arch.apic_base, apic->base_address);
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_reset);
 
 bool kvm_apic_present(struct kvm_vcpu *vcpu)
 {
@@ -866,7 +991,6 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
 {
        return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
 
 /*
  *----------------------------------------------------------------------
@@ -917,6 +1041,11 @@ static struct kvm_timer_ops lapic_timer_ops = {
        .is_periodic = lapic_is_periodic,
 };
 
+static const struct kvm_io_device_ops apic_mmio_ops = {
+       .read     = apic_mmio_read,
+       .write    = apic_mmio_write,
+};
+
 int kvm_create_lapic(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic;
@@ -945,16 +1074,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
        apic->lapic_timer.timer.function = kvm_timer_fn;
        apic->lapic_timer.t_ops = &lapic_timer_ops;
        apic->lapic_timer.kvm = vcpu->kvm;
-       apic->lapic_timer.vcpu_id = vcpu->vcpu_id;
+       apic->lapic_timer.vcpu = vcpu;
 
        apic->base_address = APIC_DEFAULT_PHYS_BASE;
        vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
 
        kvm_lapic_reset(vcpu);
-       apic->dev.read = apic_mmio_read;
-       apic->dev.write = apic_mmio_write;
-       apic->dev.in_range = apic_mmio_range;
-       apic->dev.private = apic;
+       kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
 
        return 0;
 nomem_free_apic:
@@ -962,7 +1088,6 @@ nomem_free_apic:
 nomem:
        return -ENOMEM;
 }
-EXPORT_SYMBOL_GPL(kvm_create_lapic);
 
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 {
@@ -985,7 +1110,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
        u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
        int r = 0;
 
-       if (vcpu->vcpu_id == 0) {
+       if (kvm_vcpu_is_bsp(vcpu)) {
                if (!apic_hw_enabled(vcpu->arch.apic))
                        r = 1;
                if ((lvt0 & APIC_LVT_MASKED) == 0 &&
@@ -1025,7 +1150,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
 
        apic->base_address = vcpu->arch.apic_base &
                             MSR_IA32_APICBASE_BASE;
-       apic_set_reg(apic, APIC_LVR, APIC_VERSION);
+       kvm_apic_set_version(vcpu);
+
        apic_update_ppr(apic);
        hrtimer_cancel(&apic->lapic_timer.timer);
        update_divide_count(apic);
@@ -1092,3 +1218,35 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
 
        vcpu->arch.apic->vapic_addr = vapic_addr;
 }
+
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       u32 reg = (msr - APIC_BASE_MSR) << 4;
+
+       if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+               return 1;
+
+       /* for an ICR write, store the high half (ICR2) before the command */
+       if (msr == 0x830)
+               apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+       return apic_reg_write(apic, reg, (u32)data);
+}
+
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
+
+       if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+               return 1;
+
+       if (apic_reg_read(apic, reg, 4, &low))
+               return 1;
+       if (msr == 0x830)
+               apic_reg_read(apic, APIC_ICR2, 4, &high);
+
+       *data = (((u64)high) << 32) | low;
+
+       return 0;
+}
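
Both handlers rely on reg = (msr - APIC_BASE_MSR) << 4 mapping each x2APIC MSR onto the 16-byte-spaced xAPIC register layout; MSR 0x830 lands on APIC_ICR (offset 0x300), which is why that single MSR carries both ICR halves. A standalone check with the architectural constants spelled out:

	#include <assert.h>

	#define APIC_BASE_MSR	0x800
	#define APIC_TASKPRI	0x80	/* xAPIC TPR offset */
	#define APIC_ICR	0x300	/* xAPIC ICR offset */

	int main(void)
	{
		/* Each x2APIC MSR covers one 16-byte-spaced xAPIC register. */
		assert(((0x808 - APIC_BASE_MSR) << 4) == APIC_TASKPRI);
		assert(((0x830 - APIC_BASE_MSR) << 4) == APIC_ICR);
		return 0;
	}
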
index a587f83..40010b0 100644
@@ -12,6 +12,7 @@ struct kvm_lapic {
        struct kvm_timer lapic_timer;
        u32 divide_count;
        struct kvm_vcpu *vcpu;
+       bool irr_pending;
        struct page *regs_page;
        void *regs;
        gpa_t vapic_addr;
@@ -28,6 +29,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
 u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
@@ -44,4 +46,6 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
 
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
 #endif
index 0ef5bb2..eca41ae 100644
@@ -18,6 +18,7 @@
  */
 
 #include "mmu.h"
+#include "kvm_cache_regs.h"
 
 #include <linux/kvm_host.h>
 #include <linux/types.h>
@@ -107,6 +108,9 @@ module_param(oos_shadow, bool, 0644);
 
 #define PT32_LEVEL_MASK(level) \
                (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
+#define PT32_LVL_OFFSET_MASK(level) \
+       (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+                                               * PT32_LEVEL_BITS))) - 1))
 
 #define PT32_INDEX(address, level)\
        (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
@@ -115,10 +119,19 @@ module_param(oos_shadow, bool, 0644);
 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
 #define PT64_DIR_BASE_ADDR_MASK \
        (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+#define PT64_LVL_ADDR_MASK(level) \
+       (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+                                               * PT64_LEVEL_BITS))) - 1))
+#define PT64_LVL_OFFSET_MASK(level) \
+       (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+                                               * PT64_LEVEL_BITS))) - 1))
 
 #define PT32_BASE_ADDR_MASK PAGE_MASK
 #define PT32_DIR_BASE_ADDR_MASK \
        (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+#define PT32_LVL_ADDR_MASK(level) \
+       (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+                                           * PT32_LEVEL_BITS))) - 1))
 
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
                        | PT64_NX_MASK)
@@ -129,6 +142,7 @@ module_param(oos_shadow, bool, 0644);
 #define PFERR_RSVD_MASK (1U << 3)
 #define PFERR_FETCH_MASK (1U << 4)
 
+#define PT_PDPE_LEVEL 3
 #define PT_DIRECTORY_LEVEL 2
 #define PT_PAGE_TABLE_LEVEL 1
 
@@ -139,10 +153,13 @@ module_param(oos_shadow, bool, 0644);
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
 struct kvm_rmap_desc {
-       u64 *shadow_ptes[RMAP_EXT];
+       u64 *sptes[RMAP_EXT];
        struct kvm_rmap_desc *more;
 };
 
@@ -239,16 +256,25 @@ static int is_writeble_pte(unsigned long pte)
        return pte & PT_WRITABLE_MASK;
 }
 
-static int is_dirty_pte(unsigned long pte)
+static int is_dirty_gpte(unsigned long pte)
 {
-       return pte & shadow_dirty_mask;
+       return pte & PT_DIRTY_MASK;
 }
 
-static int is_rmap_pte(u64 pte)
+static int is_rmap_spte(u64 pte)
 {
        return is_shadow_present_pte(pte);
 }
 
+static int is_last_spte(u64 pte, int level)
+{
+       if (level == PT_PAGE_TABLE_LEVEL)
+               return 1;
+       if (is_large_pte(pte))
+               return 1;
+       return 0;
+}
+
 static pfn_t spte_to_pfn(u64 pte)
 {
        return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -261,7 +287,7 @@ static gfn_t pse36_gfn_delta(u32 gpte)
        return (gpte & PT32_DIR_PSE36_MASK) << shift;
 }
 
-static void set_shadow_pte(u64 *sptep, u64 spte)
+static void __set_spte(u64 *sptep, u64 spte)
 {
 #ifdef CONFIG_X86_64
        set_64bit((unsigned long *)sptep, spte);
@@ -380,37 +406,52 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
  * Return the pointer to the largepage write count for a given
  * gfn, handling slots that are not large page aligned.
  */
-static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
+static int *slot_largepage_idx(gfn_t gfn,
+                              struct kvm_memory_slot *slot,
+                              int level)
 {
        unsigned long idx;
 
-       idx = (gfn / KVM_PAGES_PER_HPAGE) -
-             (slot->base_gfn / KVM_PAGES_PER_HPAGE);
-       return &slot->lpage_info[idx].write_count;
+       idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
+             (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+       return &slot->lpage_info[level - 2][idx].write_count;
 }
 
 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 {
+       struct kvm_memory_slot *slot;
        int *write_count;
+       int i;
 
        gfn = unalias_gfn(kvm, gfn);
-       write_count = slot_largepage_idx(gfn,
-                                        gfn_to_memslot_unaliased(kvm, gfn));
-       *write_count += 1;
+
+       slot = gfn_to_memslot_unaliased(kvm, gfn);
+       for (i = PT_DIRECTORY_LEVEL;
+            i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+               write_count   = slot_largepage_idx(gfn, slot, i);
+               *write_count += 1;
+       }
 }
 
 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 {
+       struct kvm_memory_slot *slot;
        int *write_count;
+       int i;
 
        gfn = unalias_gfn(kvm, gfn);
-       write_count = slot_largepage_idx(gfn,
-                                        gfn_to_memslot_unaliased(kvm, gfn));
-       *write_count -= 1;
-       WARN_ON(*write_count < 0);
+       for (i = PT_DIRECTORY_LEVEL;
+            i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+               slot          = gfn_to_memslot_unaliased(kvm, gfn);
+               write_count   = slot_largepage_idx(gfn, slot, i);
+               *write_count -= 1;
+               WARN_ON(*write_count < 0);
+       }
 }
 
-static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
+static int has_wrprotected_page(struct kvm *kvm,
+                               gfn_t gfn,
+                               int level)
 {
        struct kvm_memory_slot *slot;
        int *largepage_idx;
@@ -418,47 +459,67 @@ static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
        gfn = unalias_gfn(kvm, gfn);
        slot = gfn_to_memslot_unaliased(kvm, gfn);
        if (slot) {
-               largepage_idx = slot_largepage_idx(gfn, slot);
+               largepage_idx = slot_largepage_idx(gfn, slot, level);
                return *largepage_idx;
        }
 
        return 1;
 }
 
-static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
+static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 {
+       unsigned long page_size = PAGE_SIZE;
        struct vm_area_struct *vma;
        unsigned long addr;
-       int ret = 0;
+       int i, ret = 0;
 
        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr))
-               return ret;
+               return page_size;
 
        down_read(&current->mm->mmap_sem);
        vma = find_vma(current->mm, addr);
-       if (vma && is_vm_hugetlb_page(vma))
-               ret = 1;
+       if (!vma)
+               goto out;
+
+       page_size = vma_kernel_pagesize(vma);
+
+out:
        up_read(&current->mm->mmap_sem);
 
+       for (i = PT_PAGE_TABLE_LEVEL;
+            i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
+               if (page_size >= KVM_HPAGE_SIZE(i))
+                       ret = i;
+               else
+                       break;
+       }
+
        return ret;
 }
 
-static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 {
        struct kvm_memory_slot *slot;
-
-       if (has_wrprotected_page(vcpu->kvm, large_gfn))
-               return 0;
-
-       if (!host_largepage_backed(vcpu->kvm, large_gfn))
-               return 0;
+       int host_level;
+       int level = PT_PAGE_TABLE_LEVEL;
 
        slot = gfn_to_memslot(vcpu->kvm, large_gfn);
        if (slot && slot->dirty_bitmap)
-               return 0;
+               return PT_PAGE_TABLE_LEVEL;
 
-       return 1;
+       host_level = host_mapping_level(vcpu->kvm, large_gfn);
+
+       if (host_level == PT_PAGE_TABLE_LEVEL)
+               return host_level;
+
+       for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) {
+
+               if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
+                       break;
+       }
+
+       return level - 1;
 }
 
 /*
@@ -466,19 +527,19 @@ static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
  * Note: gfn must be unaliased before this function get called
  */
 
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 {
        struct kvm_memory_slot *slot;
        unsigned long idx;
 
        slot = gfn_to_memslot(kvm, gfn);
-       if (!lpage)
+       if (likely(level == PT_PAGE_TABLE_LEVEL))
                return &slot->rmap[gfn - slot->base_gfn];
 
-       idx = (gfn / KVM_PAGES_PER_HPAGE) -
-             (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+       idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
+               (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
 
-       return &slot->lpage_info[idx].rmap_pde;
+       return &slot->lpage_info[level - 2][idx].rmap_pde;
 }
 
 /*
@@ -494,42 +555,42 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
  * the spte was not added.
  *
  */
-static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
+static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 {
        struct kvm_mmu_page *sp;
        struct kvm_rmap_desc *desc;
        unsigned long *rmapp;
        int i, count = 0;
 
-       if (!is_rmap_pte(*spte))
+       if (!is_rmap_spte(*spte))
                return count;
        gfn = unalias_gfn(vcpu->kvm, gfn);
        sp = page_header(__pa(spte));
        sp->gfns[spte - sp->spt] = gfn;
-       rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
+       rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
        if (!*rmapp) {
                rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
                *rmapp = (unsigned long)spte;
        } else if (!(*rmapp & 1)) {
                rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
                desc = mmu_alloc_rmap_desc(vcpu);
-               desc->shadow_ptes[0] = (u64 *)*rmapp;
-               desc->shadow_ptes[1] = spte;
+               desc->sptes[0] = (u64 *)*rmapp;
+               desc->sptes[1] = spte;
                *rmapp = (unsigned long)desc | 1;
        } else {
                rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
                desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-               while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) {
+               while (desc->sptes[RMAP_EXT-1] && desc->more) {
                        desc = desc->more;
                        count += RMAP_EXT;
                }
-               if (desc->shadow_ptes[RMAP_EXT-1]) {
+               if (desc->sptes[RMAP_EXT-1]) {
                        desc->more = mmu_alloc_rmap_desc(vcpu);
                        desc = desc->more;
                }
-               for (i = 0; desc->shadow_ptes[i]; ++i)
+               for (i = 0; desc->sptes[i]; ++i)
                        ;
-               desc->shadow_ptes[i] = spte;
+               desc->sptes[i] = spte;
        }
        return count;
 }
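
rmap_add() keeps the rmap slot itself polymorphic via pointer tagging: a non-null value with the low bit clear is a single spte pointer, and with the low bit set it points to a kvm_rmap_desc chain (safe because both are at least word-aligned). A runnable sketch of the tagging scheme:

	#include <assert.h>
	#include <stdint.h>

	#define RMAP_EXT 4

	struct rmap_desc {
		uint64_t *sptes[RMAP_EXT];
		struct rmap_desc *more;
	};

	static int rmap_is_desc(unsigned long slot)
	{
		return slot & 1;	/* low tag bit: 1 => descriptor chain */
	}

	static struct rmap_desc *rmap_to_desc(unsigned long slot)
	{
		return (struct rmap_desc *)(slot & ~1ul);
	}

	int main(void)
	{
		static uint64_t spte;
		static struct rmap_desc d;

		unsigned long one  = (unsigned long)&spte;	/* one mapping */
		unsigned long many = (unsigned long)&d | 1;	/* 2+ mappings */

		assert(!rmap_is_desc(one) && rmap_is_desc(many));
		assert(rmap_to_desc(many) == &d);
		return 0;
	}
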
@@ -541,14 +602,14 @@ static void rmap_desc_remove_entry(unsigned long *rmapp,
 {
        int j;
 
-       for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
+       for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
                ;
-       desc->shadow_ptes[i] = desc->shadow_ptes[j];
-       desc->shadow_ptes[j] = NULL;
+       desc->sptes[i] = desc->sptes[j];
+       desc->sptes[j] = NULL;
        if (j != 0)
                return;
        if (!prev_desc && !desc->more)
-               *rmapp = (unsigned long)desc->shadow_ptes[0];
+               *rmapp = (unsigned long)desc->sptes[0];
        else
                if (prev_desc)
                        prev_desc->more = desc->more;
@@ -566,7 +627,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
        unsigned long *rmapp;
        int i;
 
-       if (!is_rmap_pte(*spte))
+       if (!is_rmap_spte(*spte))
                return;
        sp = page_header(__pa(spte));
        pfn = spte_to_pfn(*spte);
@@ -576,7 +637,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
                kvm_release_pfn_dirty(pfn);
        else
                kvm_release_pfn_clean(pfn);
-       rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
+       rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
        if (!*rmapp) {
                printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
                BUG();
@@ -593,8 +654,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
                desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
                prev_desc = NULL;
                while (desc) {
-                       for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
-                               if (desc->shadow_ptes[i] == spte) {
+                       for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
+                               if (desc->sptes[i] == spte) {
                                        rmap_desc_remove_entry(rmapp,
                                                               desc, i,
                                                               prev_desc);
@@ -625,10 +686,10 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
        prev_desc = NULL;
        prev_spte = NULL;
        while (desc) {
-               for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
+               for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
                        if (prev_spte == spte)
-                               return desc->shadow_ptes[i];
-                       prev_spte = desc->shadow_ptes[i];
+                               return desc->sptes[i];
+                       prev_spte = desc->sptes[i];
                }
                desc = desc->more;
        }
@@ -639,10 +700,10 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
        unsigned long *rmapp;
        u64 *spte;
-       int write_protected = 0;
+       int i, write_protected = 0;
 
        gfn = unalias_gfn(kvm, gfn);
-       rmapp = gfn_to_rmap(kvm, gfn, 0);
+       rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
 
        spte = rmap_next(kvm, rmapp, NULL);
        while (spte) {
@@ -650,7 +711,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
                BUG_ON(!(*spte & PT_PRESENT_MASK));
                rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
                if (is_writeble_pte(*spte)) {
-                       set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
+                       __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
                        write_protected = 1;
                }
                spte = rmap_next(kvm, rmapp, spte);
@@ -664,21 +725,24 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
        }
 
        /* check for huge page mappings */
-       rmapp = gfn_to_rmap(kvm, gfn, 1);
-       spte = rmap_next(kvm, rmapp, NULL);
-       while (spte) {
-               BUG_ON(!spte);
-               BUG_ON(!(*spte & PT_PRESENT_MASK));
-               BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
-               pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
-               if (is_writeble_pte(*spte)) {
-                       rmap_remove(kvm, spte);
-                       --kvm->stat.lpages;
-                       set_shadow_pte(spte, shadow_trap_nonpresent_pte);
-                       spte = NULL;
-                       write_protected = 1;
+       for (i = PT_DIRECTORY_LEVEL;
+            i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+               rmapp = gfn_to_rmap(kvm, gfn, i);
+               spte = rmap_next(kvm, rmapp, NULL);
+               while (spte) {
+                       BUG_ON(!spte);
+                       BUG_ON(!(*spte & PT_PRESENT_MASK));
+                       BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+                       pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
+                       if (is_writeble_pte(*spte)) {
+                               rmap_remove(kvm, spte);
+                               --kvm->stat.lpages;
+                               __set_spte(spte, shadow_trap_nonpresent_pte);
+                               spte = NULL;
+                               write_protected = 1;
+                       }
+                       spte = rmap_next(kvm, rmapp, spte);
                }
-               spte = rmap_next(kvm, rmapp, spte);
        }
 
        return write_protected;
@@ -693,7 +757,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
                BUG_ON(!(*spte & PT_PRESENT_MASK));
                rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
                rmap_remove(kvm, spte);
-               set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+               __set_spte(spte, shadow_trap_nonpresent_pte);
                need_tlb_flush = 1;
        }
        return need_tlb_flush;
@@ -702,7 +766,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
                          int (*handler)(struct kvm *kvm, unsigned long *rmapp))
 {
-       int i;
+       int i, j;
        int retval = 0;
 
        /*
@@ -721,11 +785,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
                end = start + (memslot->npages << PAGE_SHIFT);
                if (hva >= start && hva < end) {
                        gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+
                        retval |= handler(kvm, &memslot->rmap[gfn_offset]);
-                       retval |= handler(kvm,
-                                         &memslot->lpage_info[
-                                                 gfn_offset /
-                                                 KVM_PAGES_PER_HPAGE].rmap_pde);
+
+                       for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
+                               int idx = gfn_offset;
+                               idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
+                               retval |= handler(kvm,
+                                       &memslot->lpage_info[j][idx].rmap_pde);
+                       }
                }
        }
 
@@ -763,12 +831,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
 
 #define RMAP_RECYCLE_THRESHOLD 1000
 
-static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage)
+static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 {
        unsigned long *rmapp;
+       struct kvm_mmu_page *sp;
+
+       sp = page_header(__pa(spte));
 
        gfn = unalias_gfn(vcpu->kvm, gfn);
-       rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
+       rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
        kvm_unmap_rmapp(vcpu->kvm, rmapp);
        kvm_flush_remote_tlbs(vcpu->kvm);
@@ -1109,6 +1180,7 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                return 1;
        }
 
+       trace_kvm_mmu_sync_page(sp);
        if (rmap_write_protect(vcpu->kvm, sp->gfn))
                kvm_flush_remote_tlbs(vcpu->kvm);
        kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1231,8 +1303,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
                role.quadrant = quadrant;
        }
-       pgprintk("%s: looking gfn %lx role %x\n", __func__,
-                gfn, role.word);
        index = kvm_page_table_hashfn(gfn);
        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
        hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
@@ -1249,14 +1319,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
                                kvm_mmu_mark_parents_unsync(vcpu, sp);
                        }
-                       pgprintk("%s: found\n", __func__);
+                       trace_kvm_mmu_get_page(sp, false);
                        return sp;
                }
        ++vcpu->kvm->stat.mmu_cache_miss;
        sp = kvm_mmu_alloc_page(vcpu, parent_pte);
        if (!sp)
                return sp;
-       pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
        sp->gfn = gfn;
        sp->role = role;
        hlist_add_head(&sp->hash_link, bucket);
@@ -1269,6 +1338,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                vcpu->arch.mmu.prefetch_page(vcpu, sp);
        else
                nonpaging_prefetch_page(vcpu, sp);
+       trace_kvm_mmu_get_page(sp, true);
        return sp;
 }
 
@@ -1292,6 +1362,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
 {
        if (iterator->level < PT_PAGE_TABLE_LEVEL)
                return false;
+
+       if (iterator->level == PT_PAGE_TABLE_LEVEL)
+               if (is_large_pte(*iterator->sptep))
+                       return false;
+
        iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
        iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
        return true;
@@ -1312,25 +1387,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 
        pt = sp->spt;
 
-       if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
-               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-                       if (is_shadow_present_pte(pt[i]))
-                               rmap_remove(kvm, &pt[i]);
-                       pt[i] = shadow_trap_nonpresent_pte;
-               }
-               return;
-       }
-
        for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
                ent = pt[i];
 
                if (is_shadow_present_pte(ent)) {
-                       if (!is_large_pte(ent)) {
+                       if (!is_last_spte(ent, sp->role.level)) {
                                ent &= PT64_BASE_ADDR_MASK;
                                mmu_page_remove_parent_pte(page_header(ent),
                                                           &pt[i]);
                        } else {
-                               --kvm->stat.lpages;
+                               if (is_large_pte(ent))
+                                       --kvm->stat.lpages;
                                rmap_remove(kvm, &pt[i]);
                        }
                }
@@ -1346,10 +1413,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
 static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
 {
        int i;
+       struct kvm_vcpu *vcpu;
 
-       for (i = 0; i < KVM_MAX_VCPUS; ++i)
-               if (kvm->vcpus[i])
-                       kvm->vcpus[i]->arch.last_pte_updated = NULL;
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               vcpu->arch.last_pte_updated = NULL;
 }
 
 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1368,7 +1435,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
                }
                BUG_ON(!parent_pte);
                kvm_mmu_put_page(sp, parent_pte);
-               set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
+               __set_spte(parent_pte, shadow_trap_nonpresent_pte);
        }
 }
 
@@ -1400,6 +1467,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
 static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        int ret;
+
+       trace_kvm_mmu_zap_page(sp);
        ++kvm->stat.mmu_shadow_zapped;
        ret = mmu_zap_unsync_children(kvm, sp);
        kvm_mmu_page_unlink_children(kvm, sp);
@@ -1516,7 +1585,7 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
 
        for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
                if (pt[i] == shadow_notrap_nonpresent_pte)
-                       set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
+                       __set_spte(&pt[i], shadow_trap_nonpresent_pte);
        }
 }
 
@@ -1646,6 +1715,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
        struct kvm_mmu_page *s;
        struct hlist_node *node, *n;
 
+       trace_kvm_mmu_unsync_page(sp);
        index = kvm_page_table_hashfn(sp->gfn);
        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
        /* don't unsync if pagetable is shadowed with multiple roles */
@@ -1682,9 +1752,9 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
        return 0;
 }
 
-static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                    unsigned pte_access, int user_fault,
-                   int write_fault, int dirty, int largepage,
+                   int write_fault, int dirty, int level,
                    gfn_t gfn, pfn_t pfn, bool speculative,
                    bool can_unsync)
 {
@@ -1707,7 +1777,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                spte |= shadow_nx_mask;
        if (pte_access & ACC_USER_MASK)
                spte |= shadow_user_mask;
-       if (largepage)
+       if (level > PT_PAGE_TABLE_LEVEL)
                spte |= PT_PAGE_SIZE_MASK;
        if (tdp_enabled)
                spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
@@ -1718,7 +1788,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
        if ((pte_access & ACC_WRITE_MASK)
            || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
 
-               if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
+               if (level > PT_PAGE_TABLE_LEVEL &&
+                   has_wrprotected_page(vcpu->kvm, gfn, level)) {
                        ret = 1;
                        spte = shadow_trap_nonpresent_pte;
                        goto set_pte;
@@ -1732,7 +1803,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                 * is responsibility of mmu_get_page / kvm_sync_page.
                 * Same reasoning can be applied to dirty page accounting.
                 */
-               if (!can_unsync && is_writeble_pte(*shadow_pte))
+               if (!can_unsync && is_writeble_pte(*sptep))
                        goto set_pte;
 
                if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1749,65 +1820,67 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-       set_shadow_pte(shadow_pte, spte);
+       __set_spte(sptep, spte);
        return ret;
 }
 
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                         unsigned pt_access, unsigned pte_access,
                         int user_fault, int write_fault, int dirty,
-                        int *ptwrite, int largepage, gfn_t gfn,
+                        int *ptwrite, int level, gfn_t gfn,
                         pfn_t pfn, bool speculative)
 {
        int was_rmapped = 0;
-       int was_writeble = is_writeble_pte(*shadow_pte);
+       int was_writeble = is_writeble_pte(*sptep);
        int rmap_count;
 
        pgprintk("%s: spte %llx access %x write_fault %d"
                 " user_fault %d gfn %lx\n",
-                __func__, *shadow_pte, pt_access,
+                __func__, *sptep, pt_access,
                 write_fault, user_fault, gfn);
 
-       if (is_rmap_pte(*shadow_pte)) {
+       if (is_rmap_spte(*sptep)) {
                /*
                 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
                 * the parent of the now unreachable PTE.
                 */
-               if (largepage && !is_large_pte(*shadow_pte)) {
+               if (level > PT_PAGE_TABLE_LEVEL &&
+                   !is_large_pte(*sptep)) {
                        struct kvm_mmu_page *child;
-                       u64 pte = *shadow_pte;
+                       u64 pte = *sptep;
 
                        child = page_header(pte & PT64_BASE_ADDR_MASK);
-                       mmu_page_remove_parent_pte(child, shadow_pte);
-               } else if (pfn != spte_to_pfn(*shadow_pte)) {
+                       mmu_page_remove_parent_pte(child, sptep);
+               } else if (pfn != spte_to_pfn(*sptep)) {
                        pgprintk("hfn old %lx new %lx\n",
-                                spte_to_pfn(*shadow_pte), pfn);
-                       rmap_remove(vcpu->kvm, shadow_pte);
+                                spte_to_pfn(*sptep), pfn);
+                       rmap_remove(vcpu->kvm, sptep);
                } else
                        was_rmapped = 1;
        }
-       if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-                     dirty, largepage, gfn, pfn, speculative, true)) {
+
+       if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
+                     dirty, level, gfn, pfn, speculative, true)) {
                if (write_fault)
                        *ptwrite = 1;
                kvm_x86_ops->tlb_flush(vcpu);
        }
 
-       pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
+       pgprintk("%s: setting spte %llx\n", __func__, *sptep);
        pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
-                is_large_pte(*shadow_pte)? "2MB" : "4kB",
-                is_present_pte(*shadow_pte)?"RW":"R", gfn,
-                *shadow_pte, shadow_pte);
-       if (!was_rmapped && is_large_pte(*shadow_pte))
+                is_large_pte(*sptep)? "2MB" : "4kB",
+                *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
+                *sptep, sptep);
+       if (!was_rmapped && is_large_pte(*sptep))
                ++vcpu->kvm->stat.lpages;
 
-       page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
+       page_header_update_slot(vcpu->kvm, sptep, gfn);
        if (!was_rmapped) {
-               rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage);
-               if (!is_rmap_pte(*shadow_pte))
+               rmap_count = rmap_add(vcpu, sptep, gfn);
+               if (!is_rmap_spte(*sptep))
                        kvm_release_pfn_clean(pfn);
                if (rmap_count > RMAP_RECYCLE_THRESHOLD)
-                       rmap_recycle(vcpu, gfn, largepage);
+                       rmap_recycle(vcpu, sptep, gfn);
        } else {
                if (was_writeble)
                        kvm_release_pfn_dirty(pfn);
@@ -1815,7 +1888,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                        kvm_release_pfn_clean(pfn);
        }
        if (speculative) {
-               vcpu->arch.last_pte_updated = shadow_pte;
+               vcpu->arch.last_pte_updated = sptep;
                vcpu->arch.last_pte_gfn = gfn;
        }
 }
@@ -1825,7 +1898,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-                       int largepage, gfn_t gfn, pfn_t pfn)
+                       int level, gfn_t gfn, pfn_t pfn)
 {
        struct kvm_shadow_walk_iterator iterator;
        struct kvm_mmu_page *sp;
@@ -1833,11 +1906,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
        gfn_t pseudo_gfn;
 
        for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
-               if (iterator.level == PT_PAGE_TABLE_LEVEL
-                   || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
+               if (iterator.level == level) {
                        mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
                                     0, write, 1, &pt_write,
-                                    largepage, gfn, pfn, false);
+                                    level, gfn, pfn, false);
                        ++vcpu->stat.pf_fixed;
                        break;
                }
@@ -1853,10 +1925,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                                return -ENOMEM;
                        }
 
-                       set_shadow_pte(iterator.sptep,
-                                      __pa(sp->spt)
-                                      | PT_PRESENT_MASK | PT_WRITABLE_MASK
-                                      | shadow_user_mask | shadow_x_mask);
+                       __set_spte(iterator.sptep,
+                                  __pa(sp->spt)
+                                  | PT_PRESENT_MASK | PT_WRITABLE_MASK
+                                  | shadow_user_mask | shadow_x_mask);
                }
        }
        return pt_write;
@@ -1865,14 +1937,20 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
        int r;
-       int largepage = 0;
+       int level;
        pfn_t pfn;
        unsigned long mmu_seq;
 
-       if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
-               gfn &= ~(KVM_PAGES_PER_HPAGE-1);
-               largepage = 1;
-       }
+       level = mapping_level(vcpu, gfn);
+
+       /*
+        * This path builds a PAE pagetable - so we can map 2MB pages at
+        * maximum. Therefore check if the level is larger than that.
+        */
+       if (level > PT_DIRECTORY_LEVEL)
+               level = PT_DIRECTORY_LEVEL;
+
+       gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
 
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
@@ -1888,7 +1966,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
-       r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
+       r = __direct_map(vcpu, v, write, level, gfn, pfn);
        spin_unlock(&vcpu->kvm->mmu_lock);
 
 
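With mapping_level() returning a paging level instead of a largepage yes/no, aligning the faulting gfn generalizes from a hardcoded 2MB mask to any supported page size. A standalone sketch of the arithmetic (the constants and PAGES_PER_HPAGE() below are local stand-ins mirroring x86's 512-entries-per-table geometry, not the kernel macros):

#include <stdio.h>
#include <stdint.h>

#define PT_PAGE_TABLE_LEVEL 1           /* 4KB page */
#define PT_DIRECTORY_LEVEL  2           /* 2MB page */
#define PT_PDPE_LEVEL       3           /* 1GB page */

/* 512 entries per table => 9 frame-number bits per level */
#define PAGES_PER_HPAGE(level) (1ULL << (((level) - 1) * 9))

int main(void)
{
        uint64_t gfn = 0x12345;
        int level;

        for (level = PT_PAGE_TABLE_LEVEL; level <= PT_PDPE_LEVEL; level++) {
                uint64_t base = gfn & ~(PAGES_PER_HPAGE(level) - 1);
                printf("level %d: %6llu small pages, base gfn %#llx\n",
                       level, (unsigned long long)PAGES_PER_HPAGE(level),
                       (unsigned long long)base);
        }
        return 0;
}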
@@ -1954,6 +2032,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
        gfn_t root_gfn;
        struct kvm_mmu_page *sp;
        int direct = 0;
+       u64 pdptr;
 
        root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
 
@@ -1981,11 +2060,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 
                ASSERT(!VALID_PAGE(root));
                if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
-                       if (!is_present_pte(vcpu->arch.pdptrs[i])) {
+                       pdptr = kvm_pdptr_read(vcpu, i);
+                       if (!is_present_gpte(pdptr)) {
                                vcpu->arch.mmu.pae_root[i] = 0;
                                continue;
                        }
-                       root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
+                       root_gfn = pdptr >> PAGE_SHIFT;
                } else if (vcpu->arch.mmu.root_level == 0)
                        root_gfn = 0;
                if (mmu_check_root(vcpu, root_gfn))
@@ -2062,7 +2142,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 {
        pfn_t pfn;
        int r;
-       int largepage = 0;
+       int level;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        unsigned long mmu_seq;
 
@@ -2073,10 +2153,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
        if (r)
                return r;
 
-       if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
-               gfn &= ~(KVM_PAGES_PER_HPAGE-1);
-               largepage = 1;
-       }
+       level = mapping_level(vcpu, gfn);
+
+       gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
        pfn = gfn_to_pfn(vcpu->kvm, gfn);
@@ -2089,7 +2169,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
        r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-                        largepage, gfn, pfn);
+                        level, gfn, pfn);
        spin_unlock(&vcpu->kvm->mmu_lock);
 
        return r;
@@ -2206,7 +2286,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
                context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 51);
                context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
-               context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
+               context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
+                       rsvd_bits(maxphyaddr, 51) |
+                       rsvd_bits(13, 29);
                context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
                        rsvd_bits(maxphyaddr, 51) |
                        rsvd_bits(13, 20);              /* large page */
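The new rsvd_bits_mask[1][2] entry is what lets the reserved-bit check reject malformed 1GB PDPTEs: the frame of a 1GB page is 30-bit aligned, so PTE bits 13..29 below it must be zero (bit 12 is the PAT bit for large pages). A userspace sketch of the mask arithmetic, assuming the kernel's rsvd_bits(s, e) helper builds a contiguous mask of bits s through e:

#include <stdio.h>
#include <stdint.h>

/* Same shape as the kernel helper: all bits from s to e inclusive. */
static uint64_t rsvd_bits(int s, int e)
{
        return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
        printf("1GB page reserved mask: %#llx\n",       /* bits 13..29 */
               (unsigned long long)rsvd_bits(13, 29));
        printf("2MB page reserved mask: %#llx\n",       /* bits 13..20 */
               (unsigned long long)rsvd_bits(13, 20));
        return 0;
}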
@@ -2357,8 +2439,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
        spin_unlock(&vcpu->kvm->mmu_lock);
        if (r)
                goto out;
+       /* set_cr3() should ensure TLB has been flushed */
        kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
-       kvm_mmu_flush_tlb(vcpu);
 out:
        return r;
 }
@@ -2378,15 +2460,14 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 
        pte = *spte;
        if (is_shadow_present_pte(pte)) {
-               if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
-                   is_large_pte(pte))
+               if (is_last_spte(pte, sp->role.level))
                        rmap_remove(vcpu->kvm, spte);
                else {
                        child = page_header(pte & PT64_BASE_ADDR_MASK);
                        mmu_page_remove_parent_pte(child, spte);
                }
        }
-       set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+       __set_spte(spte, shadow_trap_nonpresent_pte);
        if (is_large_pte(pte))
                --vcpu->kvm->stat.lpages;
 }
@@ -2397,11 +2478,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
                                  const void *new)
 {
        if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
-               if (!vcpu->arch.update_pte.largepage ||
-                   sp->role.glevels == PT32_ROOT_LEVEL) {
-                       ++vcpu->kvm->stat.mmu_pde_zapped;
-                       return;
-               }
+               ++vcpu->kvm->stat.mmu_pde_zapped;
+               return;
         }
 
        ++vcpu->kvm->stat.mmu_pte_updated;
@@ -2447,8 +2525,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        u64 gpte = 0;
        pfn_t pfn;
 
-       vcpu->arch.update_pte.largepage = 0;
-
        if (bytes != 4 && bytes != 8)
                return;
 
@@ -2472,14 +2548,10 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                if ((bytes == 4) && (gpa % 4 == 0))
                        memcpy((void *)&gpte, new, 4);
        }
-       if (!is_present_pte(gpte))
+       if (!is_present_gpte(gpte))
                return;
        gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
-       if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
-               gfn &= ~(KVM_PAGES_PER_HPAGE-1);
-               vcpu->arch.update_pte.largepage = 1;
-       }
        vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
        pfn = gfn_to_pfn(vcpu->kvm, gfn);
@@ -2622,6 +2694,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
        gpa_t gpa;
        int r;
 
+       if (tdp_enabled)
+               return 0;
+
        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
 
        spin_lock(&vcpu->kvm->mmu_lock);
@@ -2633,7 +2708,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
-       while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
+       while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
+              !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
                struct kvm_mmu_page *sp;
 
                sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
@@ -2670,8 +2746,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
                ++vcpu->stat.mmio_exits;
                return 0;
        case EMULATE_FAIL:
-               kvm_report_emulation_failure(vcpu, "pagetable");
-               return 1;
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+               return 0;
        default:
                BUG();
        }
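EMULATE_FAIL no longer re-enters the guest; it is reported to userspace as KVM_EXIT_INTERNAL_ERROR with an emulation suberror. A sketch of how a vcpu run loop might consume that exit (fragment only: the kvm_run mapping, the KVM_RUN ioctl plumbing and the error message are illustrative, though the exit-reason and suberror constants come from this series' uapi):

#include <stdio.h>
#include <linux/kvm.h>

/* 'run' is the vcpu's mmap'ed struct kvm_run (setup not shown).
 * Returns nonzero to keep running the vcpu, zero to stop. */
static int handle_exit(struct kvm_run *run)
{
        switch (run->exit_reason) {
        case KVM_EXIT_INTERNAL_ERROR:
                if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION)
                        fprintf(stderr, "instruction emulation failed\n");
                return 0;
        default:
                return 1;       /* handle MMIO, I/O, ... as usual */
        }
}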
@@ -2712,12 +2789,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 
        ASSERT(vcpu);
 
-       if (vcpu->kvm->arch.n_requested_mmu_pages)
-               vcpu->kvm->arch.n_free_mmu_pages =
-                                       vcpu->kvm->arch.n_requested_mmu_pages;
-       else
-               vcpu->kvm->arch.n_free_mmu_pages =
-                                       vcpu->kvm->arch.n_alloc_mmu_pages;
        /*
         * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
         * Therefore we need to allocate shadow page tables in the first
@@ -3029,6 +3100,24 @@ out:
        return r;
 }
 
+int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
+{
+       struct kvm_shadow_walk_iterator iterator;
+       int nr_sptes = 0;
+
+       spin_lock(&vcpu->kvm->mmu_lock);
+       for_each_shadow_entry(vcpu, addr, iterator) {
+               sptes[iterator.level-1] = *iterator.sptep;
+               nr_sptes++;
+               if (!is_shadow_present_pte(*iterator.sptep))
+                       break;
+       }
+       spin_unlock(&vcpu->kvm->mmu_lock);
+
+       return nr_sptes;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
+
 #ifdef AUDIT
 
 static const char *audit_msg;
@@ -3041,6 +3130,54 @@ static gva_t canonicalize(gva_t gva)
        return gva;
 }
 
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
+                                u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+                           inspect_spte_fn fn)
+{
+       int i;
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+               u64 ent = sp->spt[i];
+
+               if (is_shadow_present_pte(ent)) {
+                       if (!is_last_spte(ent, sp->role.level)) {
+                               struct kvm_mmu_page *child;
+                               child = page_header(ent & PT64_BASE_ADDR_MASK);
+                               __mmu_spte_walk(kvm, child, fn);
+                       } else
+                               fn(kvm, sp, &sp->spt[i]);
+               }
+       }
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+       int i;
+       struct kvm_mmu_page *sp;
+
+       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               return;
+       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+               hpa_t root = vcpu->arch.mmu.root_hpa;
+               sp = page_header(root);
+               __mmu_spte_walk(vcpu->kvm, sp, fn);
+               return;
+       }
+       for (i = 0; i < 4; ++i) {
+               hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+               if (root && VALID_PAGE(root)) {
+                       root &= PT64_BASE_ADDR_MASK;
+                       sp = page_header(root);
+                       __mmu_spte_walk(vcpu->kvm, sp, fn);
+               }
+       }
+       return;
+}
+
 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
                                gva_t va, int level)
 {
@@ -3055,20 +3192,19 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
                        continue;
 
                va = canonicalize(va);
-               if (level > 1) {
-                       if (ent == shadow_notrap_nonpresent_pte)
-                               printk(KERN_ERR "audit: (%s) nontrapping pte"
-                                      " in nonleaf level: levels %d gva %lx"
-                                      " level %d pte %llx\n", audit_msg,
-                                      vcpu->arch.mmu.root_level, va, level, ent);
-                       else
-                               audit_mappings_page(vcpu, ent, va, level - 1);
-               } else {
+               if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
+                       audit_mappings_page(vcpu, ent, va, level - 1);
+               else {
                        gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
                        gfn_t gfn = gpa >> PAGE_SHIFT;
                        pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
                        hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
 
+                       if (is_error_pfn(pfn)) {
+                               kvm_release_pfn_clean(pfn);
+                               continue;
+                       }
+
                        if (is_shadow_present_pte(ent)
                            && (ent & PT64_BASE_ADDR_MASK) != hpa)
                                printk(KERN_ERR "xx audit error: (%s) levels %d"
@@ -3122,7 +3258,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
                        d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
                        while (d) {
                                for (k = 0; k < RMAP_EXT; ++k)
-                                       if (d->shadow_ptes[k])
+                                       if (d->sptes[k])
                                                ++nmaps;
                                        else
                                                break;
@@ -3133,9 +3269,48 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
        return nmaps;
 }
 
-static int count_writable_mappings(struct kvm_vcpu *vcpu)
+void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
+{
+       unsigned long *rmapp;
+       struct kvm_mmu_page *rev_sp;
+       gfn_t gfn;
+
+       if (*sptep & PT_WRITABLE_MASK) {
+               rev_sp = page_header(__pa(sptep));
+               gfn = rev_sp->gfns[sptep - rev_sp->spt];
+
+               if (!gfn_to_memslot(kvm, gfn)) {
+                       if (!printk_ratelimit())
+                               return;
+                       printk(KERN_ERR "%s: no memslot for gfn %ld\n",
+                                        audit_msg, gfn);
+                       printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
+                                       audit_msg, sptep - rev_sp->spt,
+                                       rev_sp->gfn);
+                       dump_stack();
+                       return;
+               }
+
+               rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
+                                   is_large_pte(*sptep));
+               if (!*rmapp) {
+                       if (!printk_ratelimit())
+                               return;
+                       printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+                                        audit_msg, *sptep);
+                       dump_stack();
+               }
+       }
+
+}
+
+void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+       mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
 {
-       int nmaps = 0;
        struct kvm_mmu_page *sp;
        int i;
 
@@ -3152,20 +3327,16 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu)
                                continue;
                        if (!(ent & PT_WRITABLE_MASK))
                                continue;
-                       ++nmaps;
+                       inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
                }
        }
-       return nmaps;
+       return;
 }
 
 static void audit_rmap(struct kvm_vcpu *vcpu)
 {
-       int n_rmap = count_rmaps(vcpu);
-       int n_actual = count_writable_mappings(vcpu);
-
-       if (n_rmap != n_actual)
-               printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
-                      __func__, audit_msg, n_rmap, n_actual);
+       check_writable_mappings_rmap(vcpu);
+       count_rmaps(vcpu);
 }
 
 static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -3173,20 +3344,28 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
        struct kvm_mmu_page *sp;
        struct kvm_memory_slot *slot;
        unsigned long *rmapp;
+       u64 *spte;
        gfn_t gfn;
 
        list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
                if (sp->role.direct)
                        continue;
+               if (sp->unsync)
+                       continue;
 
                gfn = unalias_gfn(vcpu->kvm, sp->gfn);
                slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
                rmapp = &slot->rmap[gfn - slot->base_gfn];
-               if (*rmapp)
-                       printk(KERN_ERR "%s: (%s) shadow page has writable"
-                              " mappings: gfn %lx role %x\n",
+
+               spte = rmap_next(vcpu->kvm, rmapp, NULL);
+               while (spte) {
+                       if (*spte & PT_WRITABLE_MASK)
+                               printk(KERN_ERR "%s: (%s) shadow page has "
+                               "writable mappings: gfn %lx role %x\n",
                               __func__, audit_msg, sp->gfn,
                               sp->role.word);
+                       spte = rmap_next(vcpu->kvm, rmapp, spte);
+               }
        }
 }
 
@@ -3198,7 +3377,9 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
        audit_msg = msg;
        audit_rmap(vcpu);
        audit_write_protection(vcpu);
-       audit_mappings(vcpu);
+       if (strcmp("pre pte write", audit_msg) != 0)
+               audit_mappings(vcpu);
+       audit_writable_sptes_have_rmaps(vcpu);
        dbg = olddbg;
 }
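The audit_write_protection() change above walks every spte on the rmap chain rather than only testing the head pointer; the rmap_next() idiom is begin-with-NULL, then feed back the previous return value to advance. A userspace model of that idiom (the list node stands in for the kernel's kvm_rmap_desc blocks; PT_WRITABLE_MASK is bit 1):

#include <stdio.h>
#include <stddef.h>

#define PT_WRITABLE_MASK (1ULL << 1)

/* Stand-in for the kernel's rmap chain: pass NULL for the first spte,
 * then the previously returned node to advance; NULL ends the walk. */
struct spte_node { unsigned long long spte; struct spte_node *next; };

static struct spte_node *rmap_next(struct spte_node *head,
                                   struct spte_node *prev)
{
        return prev ? prev->next : head;
}

int main(void)
{
        struct spte_node c = { 0x3000, NULL };
        struct spte_node b = { 0x2000 | PT_WRITABLE_MASK, &c };
        struct spte_node a = { 0x1000 | PT_WRITABLE_MASK, &b };
        struct spte_node *n;

        for (n = rmap_next(&a, NULL); n; n = rmap_next(&a, n))
                printf("spte %#llx%s\n", n->spte,
                       (n->spte & PT_WRITABLE_MASK) ? " (writable)" : "");
        return 0;
}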
 
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3494a2f..61a1b38 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -37,6 +37,8 @@
 #define PT32_ROOT_LEVEL 2
 #define PT32E_ROOT_LEVEL 3
 
+int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+
 static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
        if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
@@ -75,7 +77,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
        return vcpu->arch.cr0 & X86_CR0_PG;
 }
 
-static inline int is_present_pte(unsigned long pte)
+static inline int is_present_gpte(unsigned long pte)
 {
        return pte & PT_PRESENT_MASK;
 }
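kvm_mmu_get_spte_hierarchy(), declared above and defined in mmu.c, fills sptes[level - 1] from the root downwards and returns the number of levels visited. A hypothetical diagnostic caller (not part of the patch; assumes a 4-level shadow root, with VMX's EPT misconfiguration handling elsewhere in this series being the in-tree consumer):

/* Dump the spte at each level covering 'addr', top level first. */
static void dump_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr)
{
        u64 sptes[4];
        int nr, level;

        nr = kvm_mmu_get_spte_hierarchy(vcpu, addr, sptes);
        for (level = 4; level > 4 - nr; --level)
                printk(KERN_ERR "level %d: spte 0x%llx\n",
                       level, sptes[level - 1]);
}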
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
new file mode 100644
index 0000000..3e4a5c6
--- /dev/null
+++ b/arch/x86/kvm/mmutrace.h
@@ -0,0 +1,220 @@
+#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVMMMU_H
+
+#include <linux/tracepoint.h>
+#include <linux/ftrace_event.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvmmmu
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE mmutrace
+
+#define KVM_MMU_PAGE_FIELDS \
+       __field(__u64, gfn) \
+       __field(__u32, role) \
+       __field(__u32, root_count) \
+       __field(__u32, unsync)
+
+#define KVM_MMU_PAGE_ASSIGN(sp)                             \
+       __entry->gfn = sp->gfn;                      \
+       __entry->role = sp->role.word;               \
+       __entry->root_count = sp->root_count;        \
+       __entry->unsync = sp->unsync;
+
+#define KVM_MMU_PAGE_PRINTK() ({                                       \
+       const char *ret = p->buffer + p->len;                           \
+       static const char *access_str[] = {                             \
+               "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"  \
+       };                                                              \
+       union kvm_mmu_page_role role;                                   \
+                                                                       \
+       role.word = __entry->role;                                      \
+                                                                       \
+       trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge"        \
+                        " %snxe root %u %s%c",                         \
+                        __entry->gfn, role.level, role.glevels,        \
+                        role.quadrant,                                 \
+                        role.direct ? " direct" : "",                  \
+                        access_str[role.access],                       \
+                        role.invalid ? " invalid" : "",                \
+                        role.cr4_pge ? "" : "!",                       \
+                        role.nxe ? "" : "!",                           \
+                        __entry->root_count,                           \
+                        __entry->unsync ? "unsync" : "sync", 0);       \
+       ret;                                                            \
+               })
+
+#define kvm_mmu_trace_pferr_flags       \
+       { PFERR_PRESENT_MASK, "P" },    \
+       { PFERR_WRITE_MASK, "W" },      \
+       { PFERR_USER_MASK, "U" },       \
+       { PFERR_RSVD_MASK, "RSVD" },    \
+       { PFERR_FETCH_MASK, "F" }
+
+/*
+ * A pagetable walk has started
+ */
+TRACE_EVENT(
+       kvm_mmu_pagetable_walk,
+       TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
+       TP_ARGS(addr, write_fault, user_fault, fetch_fault),
+
+       TP_STRUCT__entry(
+               __field(__u64, addr)
+               __field(__u32, pferr)
+       ),
+
+       TP_fast_assign(
+               __entry->addr = addr;
+               __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
+                                | (!!fetch_fault << 4);
+       ),
+
+       TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
+                 __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+
+/* We just walked a paging element */
+TRACE_EVENT(
+       kvm_mmu_paging_element,
+       TP_PROTO(u64 pte, int level),
+       TP_ARGS(pte, level),
+
+       TP_STRUCT__entry(
+               __field(__u64, pte)
+               __field(__u32, level)
+               ),
+
+       TP_fast_assign(
+               __entry->pte = pte;
+               __entry->level = level;
+               ),
+
+       TP_printk("pte %llx level %u", __entry->pte, __entry->level)
+);
+
+/* We set a pte accessed bit */
+TRACE_EVENT(
+       kvm_mmu_set_accessed_bit,
+       TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+       TP_ARGS(table_gfn, index, size),
+
+       TP_STRUCT__entry(
+               __field(__u64, gpa)
+               ),
+
+       TP_fast_assign(
+               __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
+                               + index * size;
+               ),
+
+       TP_printk("gpa %llx", __entry->gpa)
+);
+
+/* We set a pte dirty bit */
+TRACE_EVENT(
+       kvm_mmu_set_dirty_bit,
+       TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+       TP_ARGS(table_gfn, index, size),
+
+       TP_STRUCT__entry(
+               __field(__u64, gpa)
+               ),
+
+       TP_fast_assign(
+               __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
+                               + index * size;
+               ),
+
+       TP_printk("gpa %llx", __entry->gpa)
+);
+
+TRACE_EVENT(
+       kvm_mmu_walker_error,
+       TP_PROTO(u32 pferr),
+       TP_ARGS(pferr),
+
+       TP_STRUCT__entry(
+               __field(__u32, pferr)
+               ),
+
+       TP_fast_assign(
+               __entry->pferr = pferr;
+               ),
+
+       TP_printk("pferr %x %s", __entry->pferr,
+                 __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+TRACE_EVENT(
+       kvm_mmu_get_page,
+       TP_PROTO(struct kvm_mmu_page *sp, bool created),
+       TP_ARGS(sp, created),
+
+       TP_STRUCT__entry(
+               KVM_MMU_PAGE_FIELDS
+               __field(bool, created)
+               ),
+
+       TP_fast_assign(
+               KVM_MMU_PAGE_ASSIGN(sp)
+               __entry->created = created;
+               ),
+
+       TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(),
+                 __entry->created ? "new" : "existing")
+);
+
+TRACE_EVENT(
+       kvm_mmu_sync_page,
+       TP_PROTO(struct kvm_mmu_page *sp),
+       TP_ARGS(sp),
+
+       TP_STRUCT__entry(
+               KVM_MMU_PAGE_FIELDS
+               ),
+
+       TP_fast_assign(
+               KVM_MMU_PAGE_ASSIGN(sp)
+               ),
+
+       TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+TRACE_EVENT(
+       kvm_mmu_unsync_page,
+       TP_PROTO(struct kvm_mmu_page *sp),
+       TP_ARGS(sp),
+
+       TP_STRUCT__entry(
+               KVM_MMU_PAGE_FIELDS
+               ),
+
+       TP_fast_assign(
+               KVM_MMU_PAGE_ASSIGN(sp)
+               ),
+
+       TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+TRACE_EVENT(
+       kvm_mmu_zap_page,
+       TP_PROTO(struct kvm_mmu_page *sp),
+       TP_ARGS(sp),
+
+       TP_STRUCT__entry(
+               KVM_MMU_PAGE_FIELDS
+               ),
+
+       TP_fast_assign(
+               KVM_MMU_PAGE_ASSIGN(sp)
+               ),
+
+       TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+#endif /* _TRACE_KVMMMU_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
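mmutrace.h follows the standard TRACE_EVENT recipe: the include guard is bypassed under TRACE_HEADER_MULTI_READ so the header can be expanded several times, and exactly one compilation unit turns the declarations into definitions. A fragment showing the usual wiring (mmu.c is the unit that defines CREATE_TRACE_POINTS for this header):

/* In one .c file only: emit the tracepoint definitions. */
#define CREATE_TRACE_POINTS
#include "mmutrace.h"

/* Other includers just get the trace_kvm_mmu_*() hooks, which are
 * no-ops until the event is enabled, e.g. via:
 *   echo 1 > /sys/kernel/debug/tracing/events/kvmmmu/kvm_mmu_zap_page/enable
 */
static void zap_example(struct kvm_mmu_page *sp)
{
        trace_kvm_mmu_zap_page(sp);     /* fires kvmmmu:kvm_mmu_zap_page */
}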
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 67785f6..d2fec9c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -27,7 +27,8 @@
        #define guest_walker guest_walker64
        #define FNAME(name) paging##64_##name
        #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
-       #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
+       #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+       #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
        #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
        #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
        #define PT_LEVEL_BITS PT64_LEVEL_BITS
@@ -43,7 +44,8 @@
        #define guest_walker guest_walker32
        #define FNAME(name) paging##32_##name
        #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
-       #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
+       #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
+       #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
        #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
        #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
        #define PT_LEVEL_BITS PT32_LEVEL_BITS
@@ -53,8 +55,8 @@
        #error Invalid PTTYPE value
 #endif
 
-#define gpte_to_gfn FNAME(gpte_to_gfn)
-#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
+#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
+#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
 
 /*
  * The guest_walker structure emulates the behavior of the hardware page
@@ -71,14 +73,9 @@ struct guest_walker {
        u32 error_code;
 };
 
-static gfn_t gpte_to_gfn(pt_element_t gpte)
+static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
 {
-       return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
-}
-
-static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
-{
-       return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+       return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
 static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
@@ -125,14 +122,16 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
        gpa_t pte_gpa;
        int rsvd_fault = 0;
 
-       pgprintk("%s: addr %lx\n", __func__, addr);
+       trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
+                                    fetch_fault);
 walk:
        walker->level = vcpu->arch.mmu.root_level;
        pte = vcpu->arch.cr3;
 #if PTTYPE == 64
        if (!is_long_mode(vcpu)) {
-               pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
-               if (!is_present_pte(pte))
+               pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
+               trace_kvm_mmu_paging_element(pte, walker->level);
+               if (!is_present_gpte(pte))
                        goto not_present;
                --walker->level;
        }
@@ -150,12 +149,11 @@ walk:
                pte_gpa += index * sizeof(pt_element_t);
                walker->table_gfn[walker->level - 1] = table_gfn;
                walker->pte_gpa[walker->level - 1] = pte_gpa;
-               pgprintk("%s: table_gfn[%d] %lx\n", __func__,
-                        walker->level - 1, table_gfn);
 
                kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+               trace_kvm_mmu_paging_element(pte, walker->level);
 
-               if (!is_present_pte(pte))
+               if (!is_present_gpte(pte))
                        goto not_present;
 
                rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
@@ -175,6 +173,8 @@ walk:
 #endif
 
                if (!(pte & PT_ACCESSED_MASK)) {
+                       trace_kvm_mmu_set_accessed_bit(table_gfn, index,
+                                                      sizeof(pte));
                        mark_page_dirty(vcpu->kvm, table_gfn);
                        if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
                            index, pte, pte|PT_ACCESSED_MASK))
@@ -186,18 +186,24 @@ walk:
 
                walker->ptes[walker->level - 1] = pte;
 
-               if (walker->level == PT_PAGE_TABLE_LEVEL) {
-                       walker->gfn = gpte_to_gfn(pte);
-                       break;
-               }
-
-               if (walker->level == PT_DIRECTORY_LEVEL
-                   && (pte & PT_PAGE_SIZE_MASK)
-                   && (PTTYPE == 64 || is_pse(vcpu))) {
-                       walker->gfn = gpte_to_gfn_pde(pte);
-                       walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
-                       if (PTTYPE == 32 && is_cpuid_PSE36())
+               if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
+                   ((walker->level == PT_DIRECTORY_LEVEL) &&
+                               (pte & PT_PAGE_SIZE_MASK)  &&
+                               (PTTYPE == 64 || is_pse(vcpu))) ||
+                   ((walker->level == PT_PDPE_LEVEL) &&
+                               (pte & PT_PAGE_SIZE_MASK)  &&
+                               is_long_mode(vcpu))) {
+                       int lvl = walker->level;
+
+                       walker->gfn = gpte_to_gfn_lvl(pte, lvl);
+                       walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl))
+                                       >> PAGE_SHIFT;
+
+                       if (PTTYPE == 32 &&
+                           walker->level == PT_DIRECTORY_LEVEL &&
+                           is_cpuid_PSE36())
                                walker->gfn += pse36_gfn_delta(pte);
+
                        break;
                }
 
@@ -205,9 +211,10 @@ walk:
                --walker->level;
        }
 
-       if (write_fault && !is_dirty_pte(pte)) {
+       if (write_fault && !is_dirty_gpte(pte)) {
                bool ret;
 
+               trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
                mark_page_dirty(vcpu->kvm, table_gfn);
                ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
                            pte|PT_DIRTY_MASK);
@@ -239,6 +246,7 @@ err:
                walker->error_code |= PFERR_FETCH_MASK;
        if (rsvd_fault)
                walker->error_code |= PFERR_RSVD_MASK;
+       trace_kvm_mmu_walker_error(walker->error_code);
        return 0;
 }
 
@@ -248,12 +256,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
        pt_element_t gpte;
        unsigned pte_access;
        pfn_t pfn;
-       int largepage = vcpu->arch.update_pte.largepage;
 
        gpte = *(const pt_element_t *)pte;
        if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
-               if (!is_present_pte(gpte))
-                       set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
+               if (!is_present_gpte(gpte))
+                       __set_spte(spte, shadow_notrap_nonpresent_pte);
                return;
        }
        pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -267,7 +274,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
                return;
        kvm_get_pfn(pfn);
        mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-                    gpte & PT_DIRTY_MASK, NULL, largepage,
+                    gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
                     gpte_to_gfn(gpte), pfn, true);
 }
 
@@ -276,7 +283,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
  */
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                         struct guest_walker *gw,
-                        int user_fault, int write_fault, int largepage,
+                        int user_fault, int write_fault, int hlevel,
                         int *ptwrite, pfn_t pfn)
 {
        unsigned access = gw->pt_access;
@@ -289,19 +296,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
        pt_element_t curr_pte;
        struct kvm_shadow_walk_iterator iterator;
 
-       if (!is_present_pte(gw->ptes[gw->level - 1]))
+       if (!is_present_gpte(gw->ptes[gw->level - 1]))
                return NULL;
 
        for_each_shadow_entry(vcpu, addr, iterator) {
                level = iterator.level;
                sptep = iterator.sptep;
-               if (level == PT_PAGE_TABLE_LEVEL
-                   || (largepage && level == PT_DIRECTORY_LEVEL)) {
+               if (iterator.level == hlevel) {
                        mmu_set_spte(vcpu, sptep, access,
                                     gw->pte_access & access,
                                     user_fault, write_fault,
                                     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
-                                    ptwrite, largepage,
+                                    ptwrite, level,
                                     gw->gfn, pfn, false);
                        break;
                }
@@ -311,16 +317,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
                if (is_large_pte(*sptep)) {
                        rmap_remove(vcpu->kvm, sptep);
-                       set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+                       __set_spte(sptep, shadow_trap_nonpresent_pte);
                        kvm_flush_remote_tlbs(vcpu->kvm);
                }
 
-               if (level == PT_DIRECTORY_LEVEL
-                   && gw->level == PT_DIRECTORY_LEVEL) {
+               if (level <= gw->level) {
+                       int delta = level - gw->level + 1;
                        direct = 1;
-                       if (!is_dirty_pte(gw->ptes[level - 1]))
+                       if (!is_dirty_gpte(gw->ptes[level - delta]))
                                access &= ~ACC_WRITE_MASK;
-                       table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
+                       table_gfn = gpte_to_gfn(gw->ptes[level - delta]);
+                       /* advance table_gfn when emulating 1GB pages with 4KB */
+                       if (delta == 0)
+                               table_gfn += PT_INDEX(addr, level);
                } else {
                        direct = 0;
                        table_gfn = gw->table_gfn[level - 2];
@@ -369,11 +378,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        int user_fault = error_code & PFERR_USER_MASK;
        int fetch_fault = error_code & PFERR_FETCH_MASK;
        struct guest_walker walker;
-       u64 *shadow_pte;
+       u64 *sptep;
        int write_pt = 0;
        int r;
        pfn_t pfn;
-       int largepage = 0;
+       int level = PT_PAGE_TABLE_LEVEL;
        unsigned long mmu_seq;
 
        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -399,14 +408,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
                return 0;
        }
 
-       if (walker.level == PT_DIRECTORY_LEVEL) {
-               gfn_t large_gfn;
-               large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
-               if (is_largepage_backed(vcpu, large_gfn)) {
-                       walker.gfn = large_gfn;
-                       largepage = 1;
-               }
+       if (walker.level >= PT_DIRECTORY_LEVEL) {
+               level = min(walker.level, mapping_level(vcpu, walker.gfn));
+               walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
        }
+
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
        pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
@@ -422,11 +428,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
-       shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-                                 largepage, &write_pt, pfn);
-
+       sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+                            level, &write_pt, pfn);
        pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
-                shadow_pte, *shadow_pte, write_pt);
+                sptep, *sptep, write_pt);
 
        if (!write_pt)
                vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
@@ -459,8 +464,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
                sptep = iterator.sptep;
 
                /* FIXME: properly handle invlpg on large guest pages */
-               if (level == PT_PAGE_TABLE_LEVEL ||
-                   ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
+               if (level == PT_PAGE_TABLE_LEVEL  ||
+                   ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
+                   ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
                        struct kvm_mmu_page *sp = page_header(__pa(sptep));
 
                        pte_gpa = (sp->gfn << PAGE_SHIFT);
@@ -472,7 +478,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
                                        --vcpu->kvm->stat.lpages;
                                need_flush = 1;
                        }
-                       set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+                       __set_spte(sptep, shadow_trap_nonpresent_pte);
                        break;
                }
 
@@ -489,7 +495,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
        if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
                                  sizeof(pt_element_t)))
                return;
-       if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
+       if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) {
                if (mmu_topup_memory_caches(vcpu))
                        return;
                kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
@@ -536,7 +542,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
                r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
                pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
                for (j = 0; j < ARRAY_SIZE(pt); ++j)
-                       if (r || is_present_pte(pt[j]))
+                       if (r || is_present_gpte(pt[j]))
                                sp->spt[i+j] = shadow_trap_nonpresent_pte;
                        else
                                sp->spt[i+j] = shadow_notrap_nonpresent_pte;
@@ -574,23 +580,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                                          sizeof(pt_element_t)))
                        return -EINVAL;
 
-               if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
+               if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) ||
                    !(gpte & PT_ACCESSED_MASK)) {
                        u64 nonpresent;
 
                        rmap_remove(vcpu->kvm, &sp->spt[i]);
-                       if (is_present_pte(gpte))
+                       if (is_present_gpte(gpte))
                                nonpresent = shadow_trap_nonpresent_pte;
                        else
                                nonpresent = shadow_notrap_nonpresent_pte;
-                       set_shadow_pte(&sp->spt[i], nonpresent);
+                       __set_spte(&sp->spt[i], nonpresent);
                        continue;
                }
 
                nr_present++;
                pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
                set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
-                        is_dirty_pte(gpte), 0, gfn,
+                        is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
                         spte_to_pfn(sp->spt[i]), true, false);
        }
 
@@ -603,9 +609,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 #undef PT_BASE_ADDR_MASK
 #undef PT_INDEX
 #undef PT_LEVEL_MASK
-#undef PT_DIR_BASE_ADDR_MASK
+#undef PT_LVL_ADDR_MASK
+#undef PT_LVL_OFFSET_MASK
 #undef PT_LEVEL_BITS
 #undef PT_MAX_FULL_LEVELS
 #undef gpte_to_gfn
-#undef gpte_to_gfn_pde
+#undef gpte_to_gfn_lvl
 #undef CMPXCHG
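paging_tmpl.h is a preprocessor template: mmu.c includes it once per guest PTE width, and FNAME() pastes the width into every symbol, yielding paging64_walk_addr(), paging32_walk_addr() and friends from a single body. The instantiation in mmu.c looks like this (abridged):

#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE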
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b1f658a..944cc9c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -15,7 +15,6 @@
  */
 #include <linux/kvm_host.h>
 
-#include "kvm_svm.h"
 #include "irq.h"
 #include "mmu.h"
 #include "kvm_cache_regs.h"
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <linux/sched.h>
+#include <linux/ftrace_event.h>
 
 #include <asm/desc.h>
 
 #include <asm/virtext.h>
+#include "trace.h"
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
@@ -46,6 +47,10 @@ MODULE_LICENSE("GPL");
 #define SVM_FEATURE_LBRV (1 << 1)
 #define SVM_FEATURE_SVML (1 << 2)
 
+#define NESTED_EXIT_HOST       0       /* Exit handled on host level */
+#define NESTED_EXIT_DONE       1       /* Exit caused nested vmexit  */
+#define NESTED_EXIT_CONTINUE   2       /* Further checks needed      */
+
 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 
 /* Turn on to get debugging output*/
@@ -57,6 +62,58 @@ MODULE_LICENSE("GPL");
 #define nsvm_printk(fmt, args...) do {} while(0)
 #endif
 
+static const u32 host_save_user_msrs[] = {
+#ifdef CONFIG_X86_64
+       MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+       MSR_FS_BASE,
+#endif
+       MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+};
+
+#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+
+struct kvm_vcpu;
+
+struct nested_state {
+       struct vmcb *hsave;
+       u64 hsave_msr;
+       u64 vmcb;
+
+       /* These are the merged vectors */
+       u32 *msrpm;
+
+       /* gpa pointers to the real vectors */
+       u64 vmcb_msrpm;
+
+       /* cache for intercepts of the guest */
+       u16 intercept_cr_read;
+       u16 intercept_cr_write;
+       u16 intercept_dr_read;
+       u16 intercept_dr_write;
+       u32 intercept_exceptions;
+       u64 intercept;
+
+};
+
+struct vcpu_svm {
+       struct kvm_vcpu vcpu;
+       struct vmcb *vmcb;
+       unsigned long vmcb_pa;
+       struct svm_cpu_data *svm_data;
+       uint64_t asid_generation;
+       uint64_t sysenter_esp;
+       uint64_t sysenter_eip;
+
+       u64 next_rip;
+
+       u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
+       u64 host_gs_base;
+
+       u32 *msrpm;
+
+       struct nested_state nested;
+};
+
 /* enable NPT for AMD64 and X86 with PAE */
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 static bool npt_enabled = true;
@@ -67,15 +124,14 @@ static int npt = 1;
 
 module_param(npt, int, S_IRUGO);
 
-static int nested = 0;
+static int nested = 1;
 module_param(nested, int, S_IRUGO);
 
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
+static void svm_complete_interrupts(struct vcpu_svm *svm);
 
-static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
+static int nested_svm_exit_handled(struct vcpu_svm *svm);
 static int nested_svm_vmexit(struct vcpu_svm *svm);
-static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
-                            void *arg2, void *opaque);
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
                                      bool has_error_code, u32 error_code);
 
@@ -86,7 +142,22 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 
 static inline bool is_nested(struct vcpu_svm *svm)
 {
-       return svm->nested_vmcb;
+       return svm->nested.vmcb;
+}
+
+static inline void enable_gif(struct vcpu_svm *svm)
+{
+       svm->vcpu.arch.hflags |= HF_GIF_MASK;
+}
+
+static inline void disable_gif(struct vcpu_svm *svm)
+{
+       svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+}
+
+static inline bool gif_set(struct vcpu_svm *svm)
+{
+       return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
 }
 
 static unsigned long iopm_base;
@@ -147,19 +218,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
        asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
 }
 
-static inline unsigned long kvm_read_cr2(void)
-{
-       unsigned long cr2;
-
-       asm volatile ("mov %%cr2, %0" : "=r" (cr2));
-       return cr2;
-}
-
-static inline void kvm_write_cr2(unsigned long val)
-{
-       asm volatile ("mov %0, %%cr2" :: "r" (val));
-}
-
 static inline void force_new_asid(struct kvm_vcpu *vcpu)
 {
        to_svm(vcpu)->asid_generation--;
@@ -263,7 +321,7 @@ static void svm_hardware_enable(void *garbage)
 
        struct svm_cpu_data *svm_data;
        uint64_t efer;
-       struct desc_ptr gdt_descr;
+       struct descriptor_table gdt_descr;
        struct desc_struct *gdt;
        int me = raw_smp_processor_id();
 
@@ -283,8 +341,8 @@ static void svm_hardware_enable(void *garbage)
        svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
        svm_data->next_asid = svm_data->max_asid + 1;
 
-       asm volatile ("sgdt %0" : "=m"(gdt_descr));
-       gdt = (struct desc_struct *)gdt_descr.address;
+       kvm_get_gdt(&gdt_descr);
+       gdt = (struct desc_struct *)gdt_descr.base;
        svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 
        rdmsrl(MSR_EFER, efer);
@@ -367,8 +425,6 @@ static void svm_vcpu_init_msrpm(u32 *msrpm)
 #endif
        set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
        set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
-       set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
-       set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
 }
 
 static void svm_enable_lbrv(struct vcpu_svm *svm)
@@ -595,8 +651,10 @@ static void init_vmcb(struct vcpu_svm *svm)
        }
        force_new_asid(&svm->vcpu);
 
-       svm->nested_vmcb = 0;
-       svm->vcpu.arch.hflags = HF_GIF_MASK;
+       svm->nested.vmcb = 0;
+       svm->vcpu.arch.hflags = 0;
+
+       enable_gif(svm);
 }
 
 static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -605,7 +663,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
 
        init_vmcb(svm);
 
-       if (vcpu->vcpu_id != 0) {
+       if (!kvm_vcpu_is_bsp(vcpu)) {
                kvm_rip_write(vcpu, 0);
                svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
                svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
@@ -656,9 +714,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
        hsave_page = alloc_page(GFP_KERNEL);
        if (!hsave_page)
                goto uninit;
-       svm->hsave = page_address(hsave_page);
+       svm->nested.hsave = page_address(hsave_page);
 
-       svm->nested_msrpm = page_address(nested_msrpm_pages);
+       svm->nested.msrpm = page_address(nested_msrpm_pages);
 
        svm->vmcb = page_address(page);
        clear_page(svm->vmcb);
@@ -669,7 +727,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
        fx_init(&svm->vcpu);
        svm->vcpu.fpu_active = 1;
        svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
-       if (svm->vcpu.vcpu_id == 0)
+       if (kvm_vcpu_is_bsp(&svm->vcpu))
                svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 
        return &svm->vcpu;
@@ -688,8 +746,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 
        __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
        __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
-       __free_page(virt_to_page(svm->hsave));
-       __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER);
+       __free_page(virt_to_page(svm->nested.hsave));
+       __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
        kvm_vcpu_uninit(vcpu);
        kmem_cache_free(kvm_vcpu_cache, svm);
 }
@@ -740,6 +798,18 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
        to_svm(vcpu)->vmcb->save.rflags = rflags;
 }
 
+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+       switch (reg) {
+       case VCPU_EXREG_PDPTR:
+               BUG_ON(!npt_enabled);
+               load_pdptrs(vcpu, vcpu->arch.cr3);
+               break;
+       default:
+               BUG();
+       }
+}
+
 static void svm_set_vintr(struct vcpu_svm *svm)
 {
        svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
@@ -1061,7 +1131,6 @@ static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
                val = 0;
        }
 
-       KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
        return val;
 }
 
@@ -1070,8 +1139,6 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler);
-
        *exception = 0;
 
        switch (dr) {
@@ -1119,25 +1186,9 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        fault_address  = svm->vmcb->control.exit_info_2;
        error_code = svm->vmcb->control.exit_info_1;
 
-       if (!npt_enabled)
-               KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code,
-                           (u32)fault_address, (u32)(fault_address >> 32),
-                           handler);
-       else
-               KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
-                           (u32)fault_address, (u32)(fault_address >> 32),
-                           handler);
-       /*
-        * FIXME: This shouldn't be necessary here, but there is a flush
-        * missing in the MMU code. Until we find this bug, flush the
-        * complete TLB here on an NPF
-        */
-       if (npt_enabled)
-               svm_flush_tlb(&svm->vcpu);
-       else {
-               if (kvm_event_needs_reinjection(&svm->vcpu))
-                       kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
-       }
+       trace_kvm_page_fault(fault_address, error_code);
+       if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
+               kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
        return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
 }
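
trace_kvm_page_fault() replaces the old KVMTRACE markers with a proper tracepoint; the NPT and shadow-paging cases now share one event. A sketch of the TRACE_EVENT shape this implies (the exact field layout in the series' trace header is an assumption here):

    TRACE_EVENT(kvm_page_fault,
            TP_PROTO(unsigned long fault_address, u32 error_code),
            TP_ARGS(fault_address, error_code),

            TP_STRUCT__entry(
                    __field(unsigned long, fault_address)
                    __field(u32,           error_code)
            ),

            TP_fast_assign(
                    __entry->fault_address = fault_address;
                    __entry->error_code    = error_code;
            ),

            TP_printk("address %lx error_code %x",
                      __entry->fault_address, __entry->error_code)
    );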
 
@@ -1253,14 +1304,12 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 
 static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-       KVMTRACE_0D(NMI, &svm->vcpu, handler);
        return 1;
 }
 
 static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
        ++svm->vcpu.stat.irq_exits;
-       KVMTRACE_0D(INTR, &svm->vcpu, handler);
        return 1;
 }
 
@@ -1303,44 +1352,39 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
                                      bool has_error_code, u32 error_code)
 {
-       if (is_nested(svm)) {
-               svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
-               svm->vmcb->control.exit_code_hi = 0;
-               svm->vmcb->control.exit_info_1 = error_code;
-               svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
-               if (nested_svm_exit_handled(svm, false)) {
-                       nsvm_printk("VMexit -> EXCP 0x%x\n", nr);
-
-                       nested_svm_vmexit(svm);
-                       return 1;
-               }
-       }
+       if (!is_nested(svm))
+               return 0;
 
-       return 0;
+       svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+       svm->vmcb->control.exit_code_hi = 0;
+       svm->vmcb->control.exit_info_1 = error_code;
+       svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+
+       return nested_svm_exit_handled(svm);
 }
 
 static inline int nested_svm_intr(struct vcpu_svm *svm)
 {
-       if (is_nested(svm)) {
-               if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
-                       return 0;
+       if (!is_nested(svm))
+               return 0;
 
-               if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
-                       return 0;
+       if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+               return 0;
 
-               svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+       if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
+               return 0;
 
-               if (nested_svm_exit_handled(svm, false)) {
-                       nsvm_printk("VMexit -> INTR\n");
-                       nested_svm_vmexit(svm);
-                       return 1;
-               }
+       svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+
+       if (nested_svm_exit_handled(svm)) {
+               nsvm_printk("VMexit -> INTR\n");
+               return 1;
        }
 
        return 0;
 }
 
-static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
 {
        struct page *page;
 
@@ -1348,236 +1392,246 @@ static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
        page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
        up_read(&current->mm->mmap_sem);
 
-       if (is_error_page(page)) {
-               printk(KERN_INFO "%s: could not find page at 0x%llx\n",
-                      __func__, gpa);
-               kvm_release_page_clean(page);
-               kvm_inject_gp(&svm->vcpu, 0);
-               return NULL;
-       }
-       return page;
+       if (is_error_page(page))
+               goto error;
+
+       return kmap_atomic(page, idx);
+
+error:
+       kvm_release_page_clean(page);
+       kvm_inject_gp(&svm->vcpu, 0);
+
+       return NULL;
 }
 
-static int nested_svm_do(struct vcpu_svm *svm,
-                        u64 arg1_gpa, u64 arg2_gpa, void *opaque,
-                        int (*handler)(struct vcpu_svm *svm,
-                                       void *arg1,
-                                       void *arg2,
-                                       void *opaque))
+static void nested_svm_unmap(void *addr, enum km_type idx)
 {
-       struct page *arg1_page;
-       struct page *arg2_page = NULL;
-       void *arg1;
-       void *arg2 = NULL;
-       int retval;
+       struct page *page;
 
-       arg1_page = nested_svm_get_page(svm, arg1_gpa);
-       if(arg1_page == NULL)
-               return 1;
+       if (!addr)
+               return;
 
-       if (arg2_gpa) {
-               arg2_page = nested_svm_get_page(svm, arg2_gpa);
-               if(arg2_page == NULL) {
-                       kvm_release_page_clean(arg1_page);
-                       return 1;
-               }
-       }
+       page = kmap_atomic_to_page(addr);
+
+       kunmap_atomic(addr, idx);
+       kvm_release_page_dirty(page);
+}
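
nested_svm_map()/nested_svm_unmap() replace the old callback-based nested_svm_do() with a plain map/use/unmap pattern; a minimal usage sketch (hypothetical function name), mirroring nested_svm_vmexit() below:

    static int example_touch_nested_vmcb(struct vcpu_svm *svm)
    {
            struct vmcb *nested_vmcb;

            nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
            if (!nested_vmcb)
                    return 1;       /* nested_svm_map() already injected #GP */

            /* ... read or update guest-visible VMCB fields ... */

            nested_svm_unmap(nested_vmcb, KM_USER0);
            return 0;
    }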
+
+static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+{
+       u32 param = svm->vmcb->control.exit_info_1 & 1;
+       u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+       bool ret = false;
+       u32 t0, t1;
+       u8 *msrpm;
 
-       arg1 = kmap_atomic(arg1_page, KM_USER0);
-       if (arg2_gpa)
-               arg2 = kmap_atomic(arg2_page, KM_USER1);
+       if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+               return false;
 
-       retval = handler(svm, arg1, arg2, opaque);
+       msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
+
+       if (!msrpm)
+               goto out;
+
+       switch (msr) {
+       case 0 ... 0x1fff:
+               t0 = (msr * 2) % 8;
+               t1 = msr / 4;
+               break;
+       case 0xc0000000 ... 0xc0001fff:
+               t0 = (8192 + msr - 0xc0000000) * 2;
+               t1 = (t0 / 8);
+               t0 %= 8;
+               break;
+       case 0xc0010000 ... 0xc0011fff:
+               t0 = (16384 + msr - 0xc0010000) * 2;
+               t1 = (t0 / 8);
+               t0 %= 8;
+               break;
+       default:
+               ret = true;
+               goto out;
+       }
 
-       kunmap_atomic(arg1, KM_USER0);
-       if (arg2_gpa)
-               kunmap_atomic(arg2, KM_USER1);
+       ret = msrpm[t1] & ((1 << param) << t0);
 
-       kvm_release_page_dirty(arg1_page);
-       if (arg2_gpa)
-               kvm_release_page_dirty(arg2_page);
+out:
+       nested_svm_unmap(msrpm, KM_USER0);
 
-       return retval;
+       return ret;
 }
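
The index arithmetic above follows the architectural MSR permission map layout: every MSR owns two adjacent bits (read intercept, then write intercept), and the three MSR ranges are packed back to back at bit offsets 0, 8192*2 and 16384*2. The same lookup as a standalone sketch (plain C, hypothetical helper name):

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustration only: returns true if 'msr' is intercepted in the
     * permission map; each MSR has a read bit followed by a write bit.
     */
    static bool msrpm_intercepted(const uint8_t *msrpm, uint32_t msr,
                                  bool write)
    {
            uint32_t bit;

            if (msr <= 0x1fff)                      /* 00000000h-00001fffh */
                    bit = msr * 2;
            else if (msr - 0xc0000000 <= 0x1fff)    /* c0000000h-c0001fffh */
                    bit = (8192 + (msr - 0xc0000000)) * 2;
            else if (msr - 0xc0010000 <= 0x1fff)    /* c0010000h-c0011fffh */
                    bit = (16384 + (msr - 0xc0010000)) * 2;
            else
                    return true;    /* outside the map: always intercept */

            bit += write;           /* write bit follows the read bit */
            return msrpm[bit / 8] & (1u << (bit % 8));
    }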
 
-static int nested_svm_exit_handled_real(struct vcpu_svm *svm,
-                                       void *arg1,
-                                       void *arg2,
-                                       void *opaque)
+static int nested_svm_exit_special(struct vcpu_svm *svm)
 {
-       struct vmcb *nested_vmcb = (struct vmcb *)arg1;
-       bool kvm_overrides = *(bool *)opaque;
        u32 exit_code = svm->vmcb->control.exit_code;
 
-       if (kvm_overrides) {
-               switch (exit_code) {
-               case SVM_EXIT_INTR:
-               case SVM_EXIT_NMI:
-                       return 0;
+       switch (exit_code) {
+       case SVM_EXIT_INTR:
+       case SVM_EXIT_NMI:
+               return NESTED_EXIT_HOST;
                /* For now we are always handling NPFs when using them */
-               case SVM_EXIT_NPF:
-                       if (npt_enabled)
-                               return 0;
-                       break;
-               /* When we're shadowing, trap PFs */
-               case SVM_EXIT_EXCP_BASE + PF_VECTOR:
-                       if (!npt_enabled)
-                               return 0;
-                       break;
-               default:
-                       break;
-               }
+       case SVM_EXIT_NPF:
+               if (npt_enabled)
+                       return NESTED_EXIT_HOST;
+               break;
+       /* When we're shadowing, trap PFs */
+       case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+               if (!npt_enabled)
+                       return NESTED_EXIT_HOST;
+               break;
+       default:
+               break;
        }
 
+       return NESTED_EXIT_CONTINUE;
+}
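
The NESTED_EXIT_* values are defined earlier in this patch (not shown in this hunk); definitions consistent with their use here and in nested_svm_exit_handled() below would be:

    #define NESTED_EXIT_HOST        0  /* exit handled on host level */
    #define NESTED_EXIT_DONE        1  /* exit caused a nested #vmexit */
    #define NESTED_EXIT_CONTINUE    2  /* further checks needed */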
+
+/*
+ * If this function returns NESTED_EXIT_DONE, this #vmexit was already handled
+ */
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+       u32 exit_code = svm->vmcb->control.exit_code;
+       int vmexit = NESTED_EXIT_HOST;
+
        switch (exit_code) {
+       case SVM_EXIT_MSR:
+               vmexit = nested_svm_exit_handled_msr(svm);
+               break;
        case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
                u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
-               if (nested_vmcb->control.intercept_cr_read & cr_bits)
-                       return 1;
+               if (svm->nested.intercept_cr_read & cr_bits)
+                       vmexit = NESTED_EXIT_DONE;
                break;
        }
        case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
                u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
-               if (nested_vmcb->control.intercept_cr_write & cr_bits)
-                       return 1;
+               if (svm->nested.intercept_cr_write & cr_bits)
+                       vmexit = NESTED_EXIT_DONE;
                break;
        }
        case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
                u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
-               if (nested_vmcb->control.intercept_dr_read & dr_bits)
-                       return 1;
+               if (svm->nested.intercept_dr_read & dr_bits)
+                       vmexit = NESTED_EXIT_DONE;
                break;
        }
        case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
                u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
-               if (nested_vmcb->control.intercept_dr_write & dr_bits)
-                       return 1;
+               if (svm->nested.intercept_dr_write & dr_bits)
+                       vmexit = NESTED_EXIT_DONE;
                break;
        }
        case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
                u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
-               if (nested_vmcb->control.intercept_exceptions & excp_bits)
-                       return 1;
+               if (svm->nested.intercept_exceptions & excp_bits)
+                       vmexit = NESTED_EXIT_DONE;
                break;
        }
        default: {
                u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
                nsvm_printk("exit code: 0x%x\n", exit_code);
-               if (nested_vmcb->control.intercept & exit_bits)
-                       return 1;
+               if (svm->nested.intercept & exit_bits)
+                       vmexit = NESTED_EXIT_DONE;
        }
        }
 
-       return 0;
-}
-
-static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
-                                      void *arg1, void *arg2,
-                                      void *opaque)
-{
-       struct vmcb *nested_vmcb = (struct vmcb *)arg1;
-       u8 *msrpm = (u8 *)arg2;
-        u32 t0, t1;
-       u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-       u32 param = svm->vmcb->control.exit_info_1 & 1;
-
-       if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT)))
-               return 0;
-
-       switch(msr) {
-       case 0 ... 0x1fff:
-               t0 = (msr * 2) % 8;
-               t1 = msr / 8;
-               break;
-       case 0xc0000000 ... 0xc0001fff:
-               t0 = (8192 + msr - 0xc0000000) * 2;
-               t1 = (t0 / 8);
-               t0 %= 8;
-               break;
-       case 0xc0010000 ... 0xc0011fff:
-               t0 = (16384 + msr - 0xc0010000) * 2;
-               t1 = (t0 / 8);
-               t0 %= 8;
-               break;
-       default:
-               return 1;
-               break;
+       if (vmexit == NESTED_EXIT_DONE) {
+               nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
+               nested_svm_vmexit(svm);
        }
-       if (msrpm[t1] & ((1 << param) << t0))
-               return 1;
 
-       return 0;
+       return vmexit;
+}
+
+static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
+{
+       struct vmcb_control_area *dst  = &dst_vmcb->control;
+       struct vmcb_control_area *from = &from_vmcb->control;
+
+       dst->intercept_cr_read    = from->intercept_cr_read;
+       dst->intercept_cr_write   = from->intercept_cr_write;
+       dst->intercept_dr_read    = from->intercept_dr_read;
+       dst->intercept_dr_write   = from->intercept_dr_write;
+       dst->intercept_exceptions = from->intercept_exceptions;
+       dst->intercept            = from->intercept;
+       dst->iopm_base_pa         = from->iopm_base_pa;
+       dst->msrpm_base_pa        = from->msrpm_base_pa;
+       dst->tsc_offset           = from->tsc_offset;
+       dst->asid                 = from->asid;
+       dst->tlb_ctl              = from->tlb_ctl;
+       dst->int_ctl              = from->int_ctl;
+       dst->int_vector           = from->int_vector;
+       dst->int_state            = from->int_state;
+       dst->exit_code            = from->exit_code;
+       dst->exit_code_hi         = from->exit_code_hi;
+       dst->exit_info_1          = from->exit_info_1;
+       dst->exit_info_2          = from->exit_info_2;
+       dst->exit_int_info        = from->exit_int_info;
+       dst->exit_int_info_err    = from->exit_int_info_err;
+       dst->nested_ctl           = from->nested_ctl;
+       dst->event_inj            = from->event_inj;
+       dst->event_inj_err        = from->event_inj_err;
+       dst->nested_cr3           = from->nested_cr3;
+       dst->lbr_ctl              = from->lbr_ctl;
 }
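
Copying the control area field by field, rather than with memcpy(), leaves any reserved bytes in the destination VMCB untouched. A hypothetical call site (the VMRUN path elsewhere in this patch is assumed to snapshot host controls the same way):

    /* On nested VMRUN: preserve the host controls in the hsave area so
     * that nested_svm_vmexit() can restore them later.
     */
    copy_vmcb_control_area(hsave, vmcb);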
 
-static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
+static int nested_svm_vmexit(struct vcpu_svm *svm)
 {
-       bool k = kvm_override;
-
-       switch (svm->vmcb->control.exit_code) {
-       case SVM_EXIT_MSR:
-               return nested_svm_do(svm, svm->nested_vmcb,
-                                    svm->nested_vmcb_msrpm, NULL,
-                                    nested_svm_exit_handled_msr);
-       default: break;
-       }
+       struct vmcb *nested_vmcb;
+       struct vmcb *hsave = svm->nested.hsave;
+       struct vmcb *vmcb = svm->vmcb;
 
-       return nested_svm_do(svm, svm->nested_vmcb, 0, &k,
-                            nested_svm_exit_handled_real);
-}
-
-static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
-                                 void *arg2, void *opaque)
-{
-       struct vmcb *nested_vmcb = (struct vmcb *)arg1;
-       struct vmcb *hsave = svm->hsave;
-       u64 nested_save[] = { nested_vmcb->save.cr0,
-                             nested_vmcb->save.cr3,
-                             nested_vmcb->save.cr4,
-                             nested_vmcb->save.efer,
-                             nested_vmcb->control.intercept_cr_read,
-                             nested_vmcb->control.intercept_cr_write,
-                             nested_vmcb->control.intercept_dr_read,
-                             nested_vmcb->control.intercept_dr_write,
-                             nested_vmcb->control.intercept_exceptions,
-                             nested_vmcb->control.intercept,
-                             nested_vmcb->control.msrpm_base_pa,
-                             nested_vmcb->control.iopm_base_pa,
-                             nested_vmcb->control.tsc_offset };
+       nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
+       if (!nested_vmcb)
+               return 1;
 
        /* Give the current vmcb to the guest */
-       memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb));
-       nested_vmcb->save.cr0 = nested_save[0];
-       if (!npt_enabled)
-               nested_vmcb->save.cr3 = nested_save[1];
-       nested_vmcb->save.cr4 = nested_save[2];
-       nested_vmcb->save.efer = nested_save[3];
-       nested_vmcb->control.intercept_cr_read = nested_save[4];
-       nested_vmcb->control.intercept_cr_write = nested_save[5];
-       nested_vmcb->control.intercept_dr_read = nested_save[6];
-       nested_vmcb->control.intercept_dr_write = nested_save[7];
-       nested_vmcb->control.intercept_exceptions = nested_save[8];
-       nested_vmcb->control.intercept = nested_save[9];
-       nested_vmcb->control.msrpm_base_pa = nested_save[10];
-       nested_vmcb->control.iopm_base_pa = nested_save[11];
-       nested_vmcb->control.tsc_offset = nested_save[12];
+       disable_gif(svm);
+
+       nested_vmcb->save.es     = vmcb->save.es;
+       nested_vmcb->save.cs     = vmcb->save.cs;
+       nested_vmcb->save.ss     = vmcb->save.ss;
+       nested_vmcb->save.ds     = vmcb->save.ds;
+       nested_vmcb->save.gdtr   = vmcb->save.gdtr;
+       nested_vmcb->save.idtr   = vmcb->save.idtr;
+       if (npt_enabled)
+               nested_vmcb->save.cr3    = vmcb->save.cr3;
+       nested_vmcb->save.cr2    = vmcb->save.cr2;
+       nested_vmcb->save.rflags = vmcb->save.rflags;
+       nested_vmcb->save.rip    = vmcb->save.rip;
+       nested_vmcb->save.rsp    = vmcb->save.rsp;
+       nested_vmcb->save.rax    = vmcb->save.rax;
+       nested_vmcb->save.dr7    = vmcb->save.dr7;
+       nested_vmcb->save.dr6    = vmcb->save.dr6;
+       nested_vmcb->save.cpl    = vmcb->save.cpl;
+
+       nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
+       nested_vmcb->control.int_vector        = vmcb->control.int_vector;
+       nested_vmcb->control.int_state         = vmcb->control.int_state;
+       nested_vmcb->control.exit_code         = vmcb->control.exit_code;
+       nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
+       nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
+       nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;