Merge branch 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Linus Torvalds [Sun, 24 Jul 2011 16:07:03 +0000 (09:07 -0700)]
* 'kvm-updates/3.1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (143 commits)
  KVM: IOMMU: Disable device assignment without interrupt remapping
  KVM: MMU: trace mmio page fault
  KVM: MMU: mmio page fault support
  KVM: MMU: reorganize struct kvm_shadow_walk_iterator
  KVM: MMU: lockless walking shadow page table
  KVM: MMU: do not need atomicly to set/clear spte
  KVM: MMU: introduce the rules to modify shadow page table
  KVM: MMU: abstract some functions to handle fault pfn
  KVM: MMU: filter out the mmio pfn from the fault pfn
  KVM: MMU: remove bypass_guest_pf
  KVM: MMU: split kvm_mmu_free_page
  KVM: MMU: count used shadow pages on preparing path
  KVM: MMU: rename 'pt_write' to 'emulate'
  KVM: MMU: cleanup for FNAME(fetch)
  KVM: MMU: optimize to handle dirty bit
  KVM: MMU: cache mmio info on page fault path
  KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup the code
  KVM: MMU: do not update slot bitmap if spte is nonpresent
  KVM: MMU: fix walking shadow page table
  KVM guest: KVM Steal time registration
  ...

102 files changed:
Documentation/kernel-parameters.txt
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/mmu.txt
Documentation/virtual/kvm/msr.txt
Documentation/virtual/kvm/nested-vmx.txt [new file with mode: 0644]
Documentation/virtual/kvm/ppc-pv.txt
arch/ia64/include/asm/paravirt.h
arch/ia64/kernel/paravirt.c
arch/powerpc/include/asm/cputable.h
arch/powerpc/include/asm/exception-64s.h
arch/powerpc/include/asm/hvcall.h
arch/powerpc/include/asm/kvm.h
arch/powerpc/include/asm/kvm_asm.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_booke.h
arch/powerpc/include/asm/kvm_e500.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/mmu-hash64.h
arch/powerpc/include/asm/paca.h
arch/powerpc/include/asm/ppc_asm.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/asm/reg_booke.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/cpu_setup_power7.S
arch/powerpc/kernel/cpu_setup_ppc970.S
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/head_fsl_booke.S
arch/powerpc/kernel/idle_power7.S
arch/powerpc/kernel/paca.c
arch/powerpc/kernel/process.c
arch/powerpc/kernel/setup-common.c
arch/powerpc/kernel/setup_64.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/traps.c
arch/powerpc/kvm/44x_tlb.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_64_mmu.c
arch/powerpc/kvm/book3s_64_mmu_hv.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_64_vio_hv.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_exports.c
arch/powerpc/kvm/book3s_hv.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_hv_builtin.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_hv_interrupts.S [new file with mode: 0644]
arch/powerpc/kvm/book3s_hv_rm_mmu.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_hv_rmhandlers.S [new file with mode: 0644]
arch/powerpc/kvm/book3s_interrupts.S
arch/powerpc/kvm/book3s_mmu_hpte.c
arch/powerpc/kvm/book3s_pr.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_rmhandlers.S
arch/powerpc/kvm/book3s_segment.S
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/booke.h
arch/powerpc/kvm/booke_interrupts.S
arch/powerpc/kvm/e500.c
arch/powerpc/kvm/e500_emulate.c
arch/powerpc/kvm/e500_tlb.c
arch/powerpc/kvm/e500_tlb.h
arch/powerpc/kvm/powerpc.c
arch/powerpc/kvm/timing.c
arch/powerpc/kvm/trace.h
arch/powerpc/mm/hash_native_64.c
arch/powerpc/platforms/iseries/exception.S
arch/powerpc/platforms/iseries/exception.h
arch/powerpc/sysdev/xics/icp-native.c
arch/x86/Kconfig
arch/x86/include/asm/kvm_emulate.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/kvm_para.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/paravirt_types.h
arch/x86/include/asm/processor-flags.h
arch/x86/include/asm/vmx.h
arch/x86/kernel/kvm.c
arch/x86/kernel/kvmclock.c
arch/x86/kernel/paravirt.c
arch/x86/kvm/Kconfig
arch/x86/kvm/emulate.c
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/mmu_audit.c
arch/x86/kvm/mmutrace.h
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/trace.h
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
include/linux/kvm.h
include/linux/kvm_host.h
kernel/compat.c
kernel/delayacct.c
kernel/sched.c
kernel/sched_features.h
virt/kvm/assigned-dev.c
virt/kvm/iommu.c
virt/kvm/kvm_main.c

index aa47be7..40cc653 100644 (file)
@@ -1159,10 +1159,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        for all guests.
                        Default is 1 (enabled) if in 64bit or 32bit-PAE mode
 
-       kvm-intel.bypass_guest_pf=
-                       [KVM,Intel] Disables bypassing of guest page faults
-                       on Intel chips. Default is 1 (enabled)
-
        kvm-intel.ept=  [KVM,Intel] Disable extended page tables
                        (virtualized MMU) support on capable Intel chips.
                        Default is 1 (enabled)
@@ -1737,6 +1733,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
        no-kvmapf       [X86,KVM] Disable paravirtualized asynchronous page
                        fault handling.
 
+       no-steal-acc    [X86,KVM] Disable paravirtualized steal time accounting.
+                       Steal time is computed, but won't influence scheduler
+                       behaviour.
+
        nolapic         [X86-32,APIC] Do not enable or use the local APIC.
 
        nolapic_timer   [X86-32,APIC] Do not use the local APIC timer.
index 42542eb..b0e4b9c 100644 (file)
@@ -180,6 +180,19 @@ KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time.
 If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4
 cpus max.
 
+On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
+threads in one or more virtual CPU cores.  (This is because the
+hardware requires all the hardware threads in a CPU core to be in the
+same partition.)  The KVM_CAP_PPC_SMT capability indicates the number
+of vcpus per virtual core (vcore).  The vcore id is obtained by
+dividing the vcpu id by the number of vcpus per vcore.  The vcpus in a
+given vcore will always be in the same physical core as each other
+(though that might be a different physical core from time to time).
+Userspace can control the threading (SMT) mode of the guest by its
+allocation of vcpu ids.  For example, if userspace wants
+single-threaded guest vcpus, it should make all vcpu ids be a multiple
+of the number of vcpus per vcore.
+
 4.8 KVM_GET_DIRTY_LOG (vm ioctl)
 
 Capability: basic
@@ -1143,15 +1156,10 @@ Assigns an IRQ to a passed-through device.
 
 struct kvm_assigned_irq {
        __u32 assigned_dev_id;
-       __u32 host_irq;
+       __u32 host_irq; /* ignored (legacy field) */
        __u32 guest_irq;
        __u32 flags;
        union {
-               struct {
-                       __u32 addr_lo;
-                       __u32 addr_hi;
-                       __u32 data;
-               } guest_msi;
                __u32 reserved[12];
        };
 };
@@ -1239,8 +1247,10 @@ Type: vm ioctl
 Parameters: struct kvm_assigned_msix_nr (in)
 Returns: 0 on success, -1 on error
 
-Set the number of MSI-X interrupts for an assigned device. This service can
-only be called once in the lifetime of an assigned device.
+Set the number of MSI-X interrupts for an assigned device. The number is
+reset again by terminating the MSI-X assignment of the device via
+KVM_DEASSIGN_DEV_IRQ. Calling this service more than once at any earlier
+point will fail.
 
 struct kvm_assigned_msix_nr {
        __u32 assigned_dev_id;
@@ -1291,6 +1301,135 @@ Returns the tsc frequency of the guest. The unit of the return value is
 KHz. If the host has unstable tsc this ioctl returns -EIO instead as an
 error.
 
+4.56 KVM_GET_LAPIC
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_lapic_state (out)
+Returns: 0 on success, -1 on error
+
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+       char regs[KVM_APIC_REG_SIZE];
+};
+
+Reads the Local APIC registers and copies them into the input argument.  The
+data format and layout are the same as documented in the architecture manual.
+
+4.57 KVM_SET_LAPIC
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_lapic_state (in)
+Returns: 0 on success, -1 on error
+
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+       char regs[KVM_APIC_REG_SIZE];
+};
+
+Copies the input argument into the Local APIC registers.  The data format
+and layout are the same as documented in the architecture manual.
+
+4.58 KVM_IOEVENTFD
+
+Capability: KVM_CAP_IOEVENTFD
+Architectures: all
+Type: vm ioctl
+Parameters: struct kvm_ioeventfd (in)
+Returns: 0 on success, !0 on error
+
+This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
+within the guest.  A guest write in the registered address will signal the
+provided event instead of triggering an exit.
+
+struct kvm_ioeventfd {
+       __u64 datamatch;
+       __u64 addr;        /* legal pio/mmio address */
+       __u32 len;         /* 1, 2, 4, or 8 bytes    */
+       __s32 fd;
+       __u32 flags;
+       __u8  pad[36];
+};
+
+The following flags are defined:
+
+#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
+#define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
+#define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
+
+If datamatch flag is set, the event will be signaled only if the written value
+to the registered address is equal to datamatch in struct kvm_ioeventfd.
+
+4.62 KVM_CREATE_SPAPR_TCE
+
+Capability: KVM_CAP_SPAPR_TCE
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce (in)
+Returns: file descriptor for manipulating the created TCE table
+
+This creates a virtual TCE (translation control entry) table, which
+is an IOMMU for PAPR-style virtual I/O.  It is used to translate
+logical addresses used in virtual I/O into guest physical addresses,
+and provides a scatter/gather capability for PAPR virtual I/O.
+
+/* for KVM_CAP_SPAPR_TCE */
+struct kvm_create_spapr_tce {
+       __u64 liobn;
+       __u32 window_size;
+};
+
+The liobn field gives the logical IO bus number for which to create a
+TCE table.  The window_size field specifies the size of the DMA window
+which this TCE table will translate - the table will contain one 64
+bit TCE entry for every 4kiB of the DMA window.
+
+When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE
+table has been created using this ioctl(), the kernel will handle it
+in real mode, updating the TCE table.  H_PUT_TCE calls for other
+liobns will cause a vm exit and must be handled by userspace.
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the created TCE table into userspace.  This lets userspace read
+the entries written by kernel-handled H_PUT_TCE calls, and also lets
+userspace update the TCE table directly which is useful in some
+circumstances.
+
+4.63 KVM_ALLOCATE_RMA
+
+Capability: KVM_CAP_PPC_RMA
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_allocate_rma (out)
+Returns: file descriptor for mapping the allocated RMA
+
+This allocates a Real Mode Area (RMA) from the pool allocated at boot
+time by the kernel.  An RMA is a physically-contiguous, aligned region
+of memory used on older POWER processors to provide the memory which
+will be accessed by real-mode (MMU off) accesses in a KVM guest.
+POWER processors support a set of sizes for the RMA that usually
+includes 64MB, 128MB, 256MB and some larger powers of two.
+
+/* for KVM_ALLOCATE_RMA */
+struct kvm_allocate_rma {
+       __u64 rma_size;
+};
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the allocated RMA into userspace.  The mapped area can then be
+passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the
+RMA for a virtual machine.  The size of the RMA in bytes (which is
+fixed at host kernel boot time) is returned in the rma_size field of
+the argument structure.
+
+The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl
+is supported; 2 if the processor requires all virtual machines to have
+an RMA, or 1 if the processor can use an RMA but doesn't require it,
+because it supports the Virtual RMA (VRMA) facility.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
@@ -1473,6 +1612,23 @@ Userspace can now handle the hypercall and when it's done modify the gprs as
 necessary. Upon guest entry all guest GPRs will then be replaced by the values
 in this struct.
 
+               /* KVM_EXIT_PAPR_HCALL */
+               struct {
+                       __u64 nr;
+                       __u64 ret;
+                       __u64 args[9];
+               } papr_hcall;
+
+This is used on 64-bit PowerPC when emulating a pSeries partition,
+e.g. with the 'pseries' machine type in qemu.  It occurs when the
+guest does a hypercall using the 'sc 1' instruction.  The 'nr' field
+contains the hypercall number (from the guest R3), and 'args' contains
+the arguments (from the guest R4 - R12).  Userspace should put the
+return code in 'ret' and any extra returned values in args[].
+The possible hypercalls are defined in the Power Architecture Platform
+Requirements (PAPR) document available from www.power.org (free
+developer registration required to access it).
+
                /* Fix the size of the union. */
                char padding[256];
        };
index f46aa58..5dc972c 100644 (file)
@@ -165,6 +165,10 @@ Shadow pages contain the following information:
     Contains the value of efer.nxe for which the page is valid.
   role.cr0_wp:
     Contains the value of cr0.wp for which the page is valid.
+  role.smep_andnot_wp:
+    Contains the value of cr4.smep && !cr0.wp for which the page is valid
+    (pages for which this is true are different from other pages; see the
+    treatment of cr0.wp=0 below).
   gfn:
     Either the guest page table containing the translations shadowed by this
     page, or the base page frame for linear translations.  See role.direct.
@@ -317,6 +321,20 @@ on fault type:
 
 (user write faults generate a #PF)
 
+In the first case there is an additional complication if CR4.SMEP is
+enabled: since we've turned the page into a kernel page, the kernel may now
+execute it.  We handle this by also setting spte.nx.  If we get a user
+fetch or read fault, we'll change spte.u=1 and spte.nx=gpte.nx back.
+
+To prevent an spte that was converted into a kernel page with cr0.wp=0
+from being written by the kernel after cr0.wp has changed to 1, we make
+the value of cr0.wp part of the page role.  This means that an spte created
+with one value of cr0.wp cannot be used when cr0.wp has a different value -
+it will simply be missed by the shadow page lookup code.  A similar issue
+exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after
+changing cr4.smep to 1.  To avoid this, the value of !cr0.wp && cr4.smep
+is also made a part of the page role.
+
 Large pages
 ===========
 
index d079aed..5031780 100644 (file)
@@ -185,3 +185,37 @@ MSR_KVM_ASYNC_PF_EN: 0x4b564d02
 
        Currently type 2 APF will be always delivered on the same vcpu as
        type 1 was, but guest should not rely on that.
+
+MSR_KVM_STEAL_TIME: 0x4b564d03
+
+       data: 64-byte aligned physical address of a memory area which must be
+       in guest RAM, plus an enable bit in bit 0. This memory is expected to
+       hold a copy of the following structure:
+
+       struct kvm_steal_time {
+               __u64 steal;
+               __u32 version;
+               __u32 flags;
+               __u32 pad[12];
+       }
+
+       whose data will be filled in by the hypervisor periodically. Only one
+       write, or registration, is needed for each VCPU. The interval between
+       updates of this structure is arbitrary and implementation-dependent.
+       The hypervisor may update this structure at any time it sees fit until
+       anything with bit0 == 0 is written to it. Guest is required to make sure
+       this structure is initialized to zero.
+
+       Fields have the following meanings:
+
+               version: a sequence counter. In other words, guest has to check
+               this field before and after grabbing time information and make
+               sure they are both equal and even. An odd version indicates an
+               in-progress update.
+
+               flags: At this point, always zero. May be used to indicate
+               changes in this structure in the future.
+
+               steal: the amount of time in which this vCPU did not run, in
+               nanoseconds. Time during which the vcpu is idle will not be
+               reported as steal time.
diff --git a/Documentation/virtual/kvm/nested-vmx.txt b/Documentation/virtual/kvm/nested-vmx.txt
new file mode 100644 (file)
index 0000000..8ed937d
--- /dev/null
@@ -0,0 +1,251 @@
+Nested VMX
+==========
+
+Overview
+---------
+
+On Intel processors, KVM uses Intel's VMX (Virtual-Machine eXtensions)
+to easily and efficiently run guest operating systems. Normally, these guests
+*cannot* themselves be hypervisors running their own guests, because in VMX,
+guests cannot use VMX instructions.
+
+The "Nested VMX" feature adds this missing capability - of running guest
+hypervisors (which use VMX) with their own nested guests. It does so by
+allowing a guest to use VMX instructions, and correctly and efficiently
+emulating them using the single level of VMX available in the hardware.
+
+We describe in much greater detail the theory behind the nested VMX feature,
+its implementation and its performance characteristics, in the OSDI 2010 paper
+"The Turtles Project: Design and Implementation of Nested Virtualization",
+available at:
+
+       http://www.usenix.org/events/osdi10/tech/full_papers/Ben-Yehuda.pdf
+
+
+Terminology
+-----------
+
+Single-level virtualization has two levels - the host (KVM) and the guests.
+In nested virtualization, we have three levels: The host (KVM), which we call
+L0, the guest hypervisor, which we call L1, and its nested guest, which we
+call L2.
+
+
+Known limitations
+-----------------
+
+The current code supports running Linux guests under KVM guests.
+Only 64-bit guest hypervisors are supported.
+
+Additional patches for running Windows under guest KVM, and Linux under
+guest VMware server, and support for nested EPT, are currently running in
+the lab, and will be sent as follow-on patchsets.
+
+
+Running nested VMX
+------------------
+
+The nested VMX feature is disabled by default. It can be enabled by giving
+the "nested=1" option to the kvm-intel module.
+
+No modifications are required to user space (qemu). However, qemu's default
+emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be
+explicitly enabled, by giving qemu one of the following options:
+
+     -cpu host              (emulated CPU has all features of the real CPU)
+
+     -cpu qemu64,+vmx       (add just the vmx feature to a named CPU type)
+
+
+ABIs
+----
+
+Nested VMX aims to present a standard and (eventually) fully-functional VMX
+implementation for a guest hypervisor to use. As such, the official
+specification of the ABI that it provides is Intel's VMX specification,
+namely volume 3B of their "Intel 64 and IA-32 Architectures Software
+Developer's Manual". Not all of VMX's features are currently fully supported,
+but the goal is to eventually support them all, starting with the VMX features
+which are used in practice by popular hypervisors (KVM and others).
+
+As a VMX implementation, nested VMX presents a VMCS structure to L1.
+As mandated by the spec, other than the two fields revision_id and abort,
+this structure is *opaque* to its user, who is not supposed to know or care
+about its internal structure. Rather, the structure is accessed through the
+VMREAD and VMWRITE instructions.
+Still, for debugging purposes, KVM developers might be interested to know the
+internals of this structure; This is struct vmcs12 from arch/x86/kvm/vmx.c.
+
+The name "vmcs12" refers to the VMCS that L1 builds for L2. In the code we
+also have "vmcs01", the VMCS that L0 built for L1, and "vmcs02" is the VMCS
+which L0 builds to actually run L2 - how this is done is explained in the
+aforementioned paper.
+
+For convenience, we repeat the content of struct vmcs12 here. If the internals
+of this structure change, this can break live migration across KVM versions.
+VMCS12_REVISION (from vmx.c) should be changed if struct vmcs12 or its inner
+struct shadow_vmcs is ever changed.
+
+       typedef u64 natural_width;
+       struct __packed vmcs12 {
+               /* According to the Intel spec, a VMCS region must start with
+                * these two user-visible fields */
+               u32 revision_id;
+               u32 abort;
+
+               u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
+               u32 padding[7]; /* room for future expansion */
+
+               u64 io_bitmap_a;
+               u64 io_bitmap_b;
+               u64 msr_bitmap;
+               u64 vm_exit_msr_store_addr;
+               u64 vm_exit_msr_load_addr;
+               u64 vm_entry_msr_load_addr;
+               u64 tsc_offset;
+               u64 virtual_apic_page_addr;
+               u64 apic_access_addr;
+               u64 ept_pointer;
+               u64 guest_physical_address;
+               u64 vmcs_link_pointer;
+               u64 guest_ia32_debugctl;
+               u64 guest_ia32_pat;
+               u64 guest_ia32_efer;
+               u64 guest_pdptr0;
+               u64 guest_pdptr1;
+               u64 guest_pdptr2;
+               u64 guest_pdptr3;
+               u64 host_ia32_pat;
+               u64 host_ia32_efer;
+               u64 padding64[8]; /* room for future expansion */
+               natural_width cr0_guest_host_mask;
+               natural_width cr4_guest_host_mask;
+               natural_width cr0_read_shadow;
+               natural_width cr4_read_shadow;
+               natural_width cr3_target_value0;
+               natural_width cr3_target_value1;
+               natural_width cr3_target_value2;
+               natural_width cr3_target_value3;
+               natural_width exit_qualification;
+               natural_width guest_linear_address;
+               natural_width guest_cr0;
+               natural_width guest_cr3;
+               natural_width guest_cr4;
+               natural_width guest_es_base;
+               natural_width guest_cs_base;
+               natural_width guest_ss_base;
+               natural_width guest_ds_base;
+               natural_width guest_fs_base;
+               natural_width guest_gs_base;
+               natural_width guest_ldtr_base;
+               natural_width guest_tr_base;
+               natural_width guest_gdtr_base;
+               natural_width guest_idtr_base;
+               natural_width guest_dr7;
+               natural_width guest_rsp;
+               natural_width guest_rip;
+               natural_width guest_rflags;
+               natural_width guest_pending_dbg_exceptions;
+               natural_width guest_sysenter_esp;
+               natural_width guest_sysenter_eip;
+               natural_width host_cr0;
+               natural_width host_cr3;
+               natural_width host_cr4;
+               natural_width host_fs_base;
+               natural_width host_gs_base;
+               natural_width host_tr_base;
+               natural_width host_gdtr_base;
+               natural_width host_idtr_base;
+               natural_width host_ia32_sysenter_esp;
+               natural_width host_ia32_sysenter_eip;
+               natural_width host_rsp;
+               natural_width host_rip;
+               natural_width paddingl[8]; /* room for future expansion */
+               u32 pin_based_vm_exec_control;
+               u32 cpu_based_vm_exec_control;
+               u32 exception_bitmap;
+               u32 page_fault_error_code_mask;
+               u32 page_fault_error_code_match;
+               u32 cr3_target_count;
+               u32 vm_exit_controls;
+               u32 vm_exit_msr_store_count;
+               u32 vm_exit_msr_load_count;
+               u32 vm_entry_controls;
+               u32 vm_entry_msr_load_count;
+               u32 vm_entry_intr_info_field;
+               u32 vm_entry_exception_error_code;
+               u32 vm_entry_instruction_len;
+               u32 tpr_threshold;
+               u32 secondary_vm_exec_control;
+               u32 vm_instruction_error;
+               u32 vm_exit_reason;
+               u32 vm_exit_intr_info;
+               u32 vm_exit_intr_error_code;
+               u32 idt_vectoring_info_field;
+               u32 idt_vectoring_error_code;
+               u32 vm_exit_instruction_len;
+               u32 vmx_instruction_info;
+               u32 guest_es_limit;
+               u32 guest_cs_limit;
+               u32 guest_ss_limit;
+               u32 guest_ds_limit;
+               u32 guest_fs_limit;
+               u32 guest_gs_limit;
+               u32 guest_ldtr_limit;
+               u32 guest_tr_limit;
+               u32 guest_gdtr_limit;
+               u32 guest_idtr_limit;
+               u32 guest_es_ar_bytes;
+               u32 guest_cs_ar_bytes;
+               u32 guest_ss_ar_bytes;
+               u32 guest_ds_ar_bytes;
+               u32 guest_fs_ar_bytes;
+               u32 guest_gs_ar_bytes;
+               u32 guest_ldtr_ar_bytes;
+               u32 guest_tr_ar_bytes;
+               u32 guest_interruptibility_info;
+               u32 guest_activity_state;
+               u32 guest_sysenter_cs;
+               u32 host_ia32_sysenter_cs;
+               u32 padding32[8]; /* room for future expansion */
+               u16 virtual_processor_id;
+               u16 guest_es_selector;
+               u16 guest_cs_selector;
+               u16 guest_ss_selector;
+               u16 guest_ds_selector;
+               u16 guest_fs_selector;
+               u16 guest_gs_selector;
+               u16 guest_ldtr_selector;
+               u16 guest_tr_selector;
+               u16 host_es_selector;
+               u16 host_cs_selector;
+               u16 host_ss_selector;
+               u16 host_ds_selector;
+               u16 host_fs_selector;
+               u16 host_gs_selector;
+               u16 host_tr_selector;
+       };
+
+
+Authors
+-------
+
+These patches were written by:
+     Abel Gordon, abelg <at> il.ibm.com
+     Nadav Har'El, nyh <at> il.ibm.com
+     Orit Wasserman, oritw <at> il.ibm.com
+     Ben-Ami Yassor, benami <at> il.ibm.com
+     Muli Ben-Yehuda, muli <at> il.ibm.com
+
+With contributions by:
+     Anthony Liguori, aliguori <at> us.ibm.com
+     Mike Day, mdday <at> us.ibm.com
+     Michael Factor, factor <at> il.ibm.com
+     Zvi Dubitzky, dubi <at> il.ibm.com
+
+And valuable reviews by:
+     Avi Kivity, avi <at> redhat.com
+     Gleb Natapov, gleb <at> redhat.com
+     Marcelo Tosatti, mtosatti <at> redhat.com
+     Kevin Tian, kevin.tian <at> intel.com
+     and others.
index 3ab969c..2b7ce19 100644 (file)
@@ -68,9 +68,11 @@ page that contains parts of supervisor visible register state. The guest can
 map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE.
 
 With this hypercall issued the guest always gets the magic page mapped at the
-desired location in effective and physical address space. For now, we always
-map the page to -4096. This way we can access it using absolute load and store
-functions. The following instruction reads the first field of the magic page:
+desired location. The first parameter indicates the effective address when the
+MMU is enabled. The second parameter indicates the address in real mode, if
+applicable to the target. For now, we always map the page to -4096. This way we
+can access it using absolute load and store functions. The following
+instruction reads the first field of the magic page:
 
        ld      rX, -4096(0)
 
index 2eb0a98..32551d3 100644 (file)
@@ -281,6 +281,10 @@ paravirt_init_missing_ticks_accounting(int cpu)
                pv_time_ops.init_missing_ticks_accounting(cpu);
 }
 
+struct jump_label_key;
+extern struct jump_label_key paravirt_steal_enabled;
+extern struct jump_label_key paravirt_steal_rq_enabled;
+
 static inline int
 paravirt_do_steal_accounting(unsigned long *new_itm)
 {
index a21d7bb..1008682 100644 (file)
@@ -634,6 +634,8 @@ struct pv_irq_ops pv_irq_ops = {
  * pv_time_ops
  * time operations
  */
+struct jump_label_key paravirt_steal_enabled;
+struct jump_label_key paravirt_steal_rq_enabled;
 
 static int
 ia64_native_do_steal_accounting(unsigned long *new_itm)
index c0d842c..e30442c 100644 (file)
@@ -179,8 +179,9 @@ extern const char *powerpc_base_platform;
 #define LONG_ASM_CONST(x)              0
 #endif
 
-
-#define CPU_FTR_HVMODE_206             LONG_ASM_CONST(0x0000000800000000)
+#define CPU_FTR_HVMODE                 LONG_ASM_CONST(0x0000000200000000)
+#define CPU_FTR_ARCH_201               LONG_ASM_CONST(0x0000000400000000)
+#define CPU_FTR_ARCH_206               LONG_ASM_CONST(0x0000000800000000)
 #define CPU_FTR_CFAR                   LONG_ASM_CONST(0x0000001000000000)
 #define CPU_FTR_IABR                   LONG_ASM_CONST(0x0000002000000000)
 #define CPU_FTR_MMCRA                  LONG_ASM_CONST(0x0000004000000000)
@@ -401,9 +402,10 @@ extern const char *powerpc_base_platform;
            CPU_FTR_MMCRA | CPU_FTR_CP_USE_DCBTZ | \
            CPU_FTR_STCX_CHECKS_ADDRESS)
 #define CPU_FTRS_PPC970        (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-           CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
+           CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_201 | \
            CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA | \
-           CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS)
+           CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS | \
+           CPU_FTR_HVMODE)
 #define CPU_FTRS_POWER5        (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
            CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
            CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -417,13 +419,13 @@ extern const char *powerpc_base_platform;
            CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \
            CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_CFAR)
 #define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
-           CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_HVMODE_206 |\
+           CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\
            CPU_FTR_MMCRA | CPU_FTR_SMT | \
            CPU_FTR_COHERENT_ICACHE | \
            CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
            CPU_FTR_DSCR | CPU_FTR_SAO  | CPU_FTR_ASYM_SMT | \
            CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-           CPU_FTR_ICSWX | CPU_FTR_CFAR)
+           CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE)
 #define CPU_FTRS_CELL  (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
            CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
            CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
index f5dfe34..8057f4f 100644 (file)
 #define EXC_HV H
 #define EXC_STD
 
-#define EXCEPTION_PROLOG_1(area)                                       \
+#define __EXCEPTION_PROLOG_1(area, extra, vec)                         \
        GET_PACA(r13);                                                  \
        std     r9,area+EX_R9(r13);     /* save r9 - r12 */             \
        std     r10,area+EX_R10(r13);                                   \
-       std     r11,area+EX_R11(r13);                                   \
-       std     r12,area+EX_R12(r13);                                   \
        BEGIN_FTR_SECTION_NESTED(66);                                   \
        mfspr   r10,SPRN_CFAR;                                          \
        std     r10,area+EX_CFAR(r13);                                  \
        END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);         \
-       GET_SCRATCH0(r9);                                               \
-       std     r9,area+EX_R13(r13);                                    \
-       mfcr    r9
+       mfcr    r9;                                                     \
+       extra(vec);                                                     \
+       std     r11,area+EX_R11(r13);                                   \
+       std     r12,area+EX_R12(r13);                                   \
+       GET_SCRATCH0(r10);                                              \
+       std     r10,area+EX_R13(r13)
+#define EXCEPTION_PROLOG_1(area, extra, vec)                           \
+       __EXCEPTION_PROLOG_1(area, extra, vec)
 
 #define __EXCEPTION_PROLOG_PSERIES_1(label, h)                         \
        ld      r12,PACAKBASE(r13);     /* get high part of &label */   \
        mtspr   SPRN_##h##SRR1,r10;                                     \
        h##rfid;                                                        \
        b       .       /* prevent speculative execution */
-#define EXCEPTION_PROLOG_PSERIES_1(label, h) \
+#define EXCEPTION_PROLOG_PSERIES_1(label, h)                           \
        __EXCEPTION_PROLOG_PSERIES_1(label, h)
 
-#define EXCEPTION_PROLOG_PSERIES(area, label, h)                       \
-       EXCEPTION_PROLOG_1(area);                                       \
+#define EXCEPTION_PROLOG_PSERIES(area, label, h, extra, vec)           \
+       EXCEPTION_PROLOG_1(area, extra, vec);                           \
        EXCEPTION_PROLOG_PSERIES_1(label, h);
 
+#define __KVMTEST(n)                                                   \
+       lbz     r10,HSTATE_IN_GUEST(r13);                       \
+       cmpwi   r10,0;                                                  \
+       bne     do_kvm_##n
+
+#define __KVM_HANDLER(area, h, n)                                      \
+do_kvm_##n:                                                            \
+       ld      r10,area+EX_R10(r13);                                   \
+       stw     r9,HSTATE_SCRATCH1(r13);                        \
+       ld      r9,area+EX_R9(r13);                                     \
+       std     r12,HSTATE_SCRATCH0(r13);                       \
+       li      r12,n;                                                  \
+       b       kvmppc_interrupt
+
+#define __KVM_HANDLER_SKIP(area, h, n)                                 \
+do_kvm_##n:                                                            \
+       cmpwi   r10,KVM_GUEST_MODE_SKIP;                                \
+       ld      r10,area+EX_R10(r13);                                   \
+       beq     89f;                                                    \
+       stw     r9,HSTATE_SCRATCH1(r13);                        \
+       ld      r9,area+EX_R9(r13);                                     \
+       std     r12,HSTATE_SCRATCH0(r13);                       \
+       li      r12,n;                                                  \
+       b       kvmppc_interrupt;                                       \
+89:    mtocrf  0x80,r9;                                                \
+       ld      r9,area+EX_R9(r13);                                     \
+       b       kvmppc_skip_##h##interrupt
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#define KVMTEST(n)                     __KVMTEST(n)
+#define KVM_HANDLER(area, h, n)                __KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_SKIP(area, h, n)   __KVM_HANDLER_SKIP(area, h, n)
+
+#else
+#define KVMTEST(n)
+#define KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_SKIP(area, h, n)
+#endif
+
+#ifdef CONFIG_KVM_BOOK3S_PR
+#define KVMTEST_PR(n)                  __KVMTEST(n)
+#define KVM_HANDLER_PR(area, h, n)     __KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_PR_SKIP(area, h, n)        __KVM_HANDLER_SKIP(area, h, n)
+
+#else
+#define KVMTEST_PR(n)
+#define KVM_HANDLER_PR(area, h, n)
+#define KVM_HANDLER_PR_SKIP(area, h, n)
+#endif
+
+#define NOTEST(n)
+
 /*
  * The common exception prolog is used for all except a few exceptions
  * such as a segment miss on a kernel address.  We have to be prepared
        .globl label##_pSeries;                         \
 label##_pSeries:                                       \
        HMT_MEDIUM;                                     \
-       DO_KVM  vec;                                    \
        SET_SCRATCH0(r13);              /* save r13 */          \
-       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, EXC_STD)
+       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,    \
+                                EXC_STD, KVMTEST_PR, vec)
 
 #define STD_EXCEPTION_HV(loc, vec, label)              \
        . = loc;                                        \
        .globl label##_hv;                              \
 label##_hv:                                            \
        HMT_MEDIUM;                                     \
-       DO_KVM  vec;                                    \
-       SET_SCRATCH0(r13);      /* save r13 */          \
-       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, EXC_HV)
+       SET_SCRATCH0(r13);      /* save r13 */                  \
+       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,    \
+                                EXC_HV, KVMTEST, vec)
 
-#define __MASKABLE_EXCEPTION_PSERIES(vec, label, h)                    \
-       HMT_MEDIUM;                                                     \
-       DO_KVM  vec;                                                    \
-       SET_SCRATCH0(r13);    /* save r13 */                            \
-       GET_PACA(r13);                                                  \
-       std     r9,PACA_EXGEN+EX_R9(r13);       /* save r9, r10 */      \
-       std     r10,PACA_EXGEN+EX_R10(r13);                             \
+#define __SOFTEN_TEST(h)                                               \
        lbz     r10,PACASOFTIRQEN(r13);                                 \
-       mfcr    r9;                                                     \
        cmpwi   r10,0;                                                  \
-       beq     masked_##h##interrupt;                                  \
-       GET_SCRATCH0(r10);                                              \
-       std     r10,PACA_EXGEN+EX_R13(r13);                             \
-       std     r11,PACA_EXGEN+EX_R11(r13);                             \
-       std     r12,PACA_EXGEN+EX_R12(r13);                             \
-       ld      r12,PACAKBASE(r13);     /* get high part of &label */   \
-       ld      r10,PACAKMSR(r13);      /* get MSR value for kernel */  \
-       mfspr   r11,SPRN_##h##SRR0;     /* save SRR0 */                 \
-       LOAD_HANDLER(r12,label##_common)                                \
-       mtspr   SPRN_##h##SRR0,r12;                                     \
-       mfspr   r12,SPRN_##h##SRR1;     /* and SRR1 */                  \
-       mtspr   SPRN_##h##SRR1,r10;                                     \
-       h##rfid;                                                        \
-       b       .       /* prevent speculative execution */
-#define _MASKABLE_EXCEPTION_PSERIES(vec, label, h)                     \
-       __MASKABLE_EXCEPTION_PSERIES(vec, label, h)
+       beq     masked_##h##interrupt
+#define _SOFTEN_TEST(h)        __SOFTEN_TEST(h)
+
+#define SOFTEN_TEST_PR(vec)                                            \
+       KVMTEST_PR(vec);                                                \
+       _SOFTEN_TEST(EXC_STD)
+
+#define SOFTEN_TEST_HV(vec)                                            \
+       KVMTEST(vec);                                                   \
+       _SOFTEN_TEST(EXC_HV)
+
+#define SOFTEN_TEST_HV_201(vec)                                                \
+       KVMTEST(vec);                                                   \
+       _SOFTEN_TEST(EXC_STD)
+
+#define __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)             \
+       HMT_MEDIUM;                                                     \
+       SET_SCRATCH0(r13);    /* save r13 */                            \
+       __EXCEPTION_PROLOG_1(PACA_EXGEN, extra, vec);           \
+       EXCEPTION_PROLOG_PSERIES_1(label##_common, h);
+#define _MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)              \
+       __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)
 
 #define MASKABLE_EXCEPTION_PSERIES(loc, vec, label)                    \
        . = loc;                                                        \
        .globl label##_pSeries;                                         \
 label##_pSeries:                                                       \
-       _MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_STD)
+       _MASKABLE_EXCEPTION_PSERIES(vec, label,                         \
+                                   EXC_STD, SOFTEN_TEST_PR)
 
 #define MASKABLE_EXCEPTION_HV(loc, vec, label)                         \
        . = loc;                                                        \
        .globl label##_hv;                                              \
 label##_hv:                                                            \
-       _MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_HV)
+       _MASKABLE_EXCEPTION_PSERIES(vec, label,                         \
+                                   EXC_HV, SOFTEN_TEST_HV)
 
 #ifdef CONFIG_PPC_ISERIES
 #define DISABLE_INTS                           \
index fd8201d..1c324ff 100644 (file)
 #define H_LONG_BUSY_ORDER_100_SEC      9905  /* Long busy, hint that 100sec \
                                                 is a good time to retry */
 #define H_LONG_BUSY_END_RANGE          9905  /* End of long busy range */
+
+/* Internal value used in book3s_hv kvm support; not returned to guests */
+#define H_TOO_HARD     9999
+
 #define H_HARDWARE     -1      /* Hardware error */
 #define H_FUNCTION     -2      /* Function not supported */
 #define H_PRIVILEGE    -3      /* Caller not privileged */
 #define H_PAGE_SET_ACTIVE      H_PAGE_STATE_CHANGE
 #define H_AVPN                 (1UL<<(63-32))  /* An avpn is provided as a sanity test */
 #define H_ANDCOND              (1UL<<(63-33))
+#define H_LOCAL                        (1UL<<(63-35))
 #define H_ICACHE_INVALIDATE    (1UL<<(63-40))  /* icbi, etc.  (ignored for IO pages) */
 #define H_ICACHE_SYNCHRONIZE   (1UL<<(63-41))  /* dcbst, icbi, etc (ignored for IO pages */
 #define H_COALESCE_CAND        (1UL<<(63-42))  /* page is a good candidate for coalescing */
index d2ca5ed..a4f6c85 100644 (file)
 
 #include <linux/types.h>
 
+/* Select powerpc specific features in <linux/kvm.h> */
+#define __KVM_HAVE_SPAPR_TCE
+#define __KVM_HAVE_PPC_SMT
+
 struct kvm_regs {
        __u64 pc;
        __u64 cr;
@@ -272,4 +276,15 @@ struct kvm_guest_debug_arch {
 #define KVM_INTERRUPT_UNSET    -2U
 #define KVM_INTERRUPT_SET_LEVEL        -3U
 
+/* for KVM_CAP_SPAPR_TCE */
+struct kvm_create_spapr_tce {
+       __u64 liobn;
+       __u32 window_size;
+};
+
+/* for KVM_ALLOCATE_RMA */
+struct kvm_allocate_rma {
+       __u64 rma_size;
+};
+
 #endif /* __LINUX_KVM_POWERPC_H */
index 0951b17..7b1f0e0 100644 (file)
 #define BOOK3S_INTERRUPT_PROGRAM       0x700
 #define BOOK3S_INTERRUPT_FP_UNAVAIL    0x800
 #define BOOK3S_INTERRUPT_DECREMENTER   0x900
+#define BOOK3S_INTERRUPT_HV_DECREMENTER        0x980
 #define BOOK3S_INTERRUPT_SYSCALL       0xc00
 #define BOOK3S_INTERRUPT_TRACE         0xd00
+#define BOOK3S_INTERRUPT_H_DATA_STORAGE        0xe00
+#define BOOK3S_INTERRUPT_H_INST_STORAGE        0xe20
+#define BOOK3S_INTERRUPT_H_EMUL_ASSIST 0xe40
 #define BOOK3S_INTERRUPT_PERFMON       0xf00
 #define BOOK3S_INTERRUPT_ALTIVEC       0xf20
 #define BOOK3S_INTERRUPT_VSX           0xf40
index d62e703..98da010 100644 (file)
 #include <linux/kvm_host.h>
 #include <asm/kvm_book3s_asm.h>
 
-struct kvmppc_slb {
-       u64 esid;
-       u64 vsid;
-       u64 orige;
-       u64 origv;
-       bool valid      : 1;
-       bool Ks         : 1;
-       bool Kp         : 1;
-       bool nx         : 1;
-       bool large      : 1;    /* PTEs are 16MB */
-       bool tb         : 1;    /* 1TB segment */
-       bool class      : 1;
-};
-
 struct kvmppc_bat {
        u64 raw;
        u32 bepi;
@@ -67,11 +53,22 @@ struct kvmppc_sid_map {
 #define VSID_POOL_SIZE (SID_CONTEXTS * 16)
 #endif
 
+struct hpte_cache {
+       struct hlist_node list_pte;
+       struct hlist_node list_pte_long;
+       struct hlist_node list_vpte;
+       struct hlist_node list_vpte_long;
+       struct rcu_head rcu_head;
+       u64 host_va;
+       u64 pfn;
+       ulong slot;
+       struct kvmppc_pte pte;
+};
+
 struct kvmppc_vcpu_book3s {
        struct kvm_vcpu vcpu;
        struct kvmppc_book3s_shadow_vcpu *shadow_vcpu;
        struct kvmppc_sid_map sid_map[SID_MAP_NUM];
-       struct kvmppc_slb slb[64];
        struct {
                u64 esid;
                u64 vsid;
@@ -81,7 +78,6 @@ struct kvmppc_vcpu_book3s {
        struct kvmppc_bat dbat[8];
        u64 hid[6];
        u64 gqr[8];
-       int slb_nr;
        u64 sdr1;
        u64 hior;
        u64 msr_mask;
@@ -93,7 +89,13 @@ struct kvmppc_vcpu_book3s {
        u64 vsid_max;
 #endif
        int context_id[SID_CONTEXTS];
-       ulong prog_flags; /* flags to inject when giving a 700 trap */
+
+       struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
+       struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
+       struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
+       struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
+       int hpte_cache_count;
+       spinlock_t mmu_lock;
 };
 
 #define CONTEXT_HOST           0
@@ -110,8 +112,10 @@ extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong ea, ulong ea_mask)
 extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask);
 extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end);
 extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
+extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr);
 extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
@@ -123,19 +127,22 @@ extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern int kvmppc_mmu_hpte_sysinit(void);
 extern void kvmppc_mmu_hpte_sysexit(void);
+extern int kvmppc_mmu_hv_init(void);
 
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
                           bool upper, u32 val);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
-extern ulong kvmppc_trampoline_lowmem;
-extern ulong kvmppc_trampoline_enter;
+extern void kvmppc_handler_lowmem_trampoline(void);
+extern void kvmppc_handler_trampoline_enter(void);
 extern void kvmppc_rmcall(ulong srr0, ulong srr1);
+extern void kvmppc_hv_entry_trampoline(void);
 extern void kvmppc_load_up_fpu(void);
 extern void kvmppc_load_up_altivec(void);
 extern void kvmppc_load_up_vsx(void);
@@ -147,15 +154,32 @@ static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
        return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
 }
 
-static inline ulong dsisr(void)
+extern void kvm_return_point(void);
+
+/* Also add subarch specific defines */
+
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+#include <asm/kvm_book3s_32.h>
+#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/kvm_book3s_64.h>
+#endif
+
+#ifdef CONFIG_KVM_BOOK3S_PR
+
+static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
 {
-       ulong r;
-       asm ( "mfdsisr %0 " : "=r" (r) );
-       return r;
+       return to_book3s(vcpu)->hior;
 }
 
-extern void kvm_return_point(void);
-static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu);
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+                       unsigned long pending_now, unsigned long old_pending)
+{
+       if (pending_now)
+               vcpu->arch.shared->int_pending = 1;
+       else if (old_pending)
+               vcpu->arch.shared->int_pending = 0;
+}
 
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
@@ -244,6 +268,120 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
        return to_svcpu(vcpu)->fault_dar;
 }
 
+static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
+{
+       ulong crit_raw = vcpu->arch.shared->critical;
+       ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
+       bool crit;
+
+       /* Truncate crit indicators in 32 bit mode */
+       if (!(vcpu->arch.shared->msr & MSR_SF)) {
+               crit_raw &= 0xffffffff;
+               crit_r1 &= 0xffffffff;
+       }
+
+       /* Critical section when crit == r1 */
+       crit = (crit_raw == crit_r1);
+       /* ... and we're in supervisor mode */
+       crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
+
+       return crit;
+}
+#else /* CONFIG_KVM_BOOK3S_PR */
+
+static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+                       unsigned long pending_now, unsigned long old_pending)
+{
+}
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+       vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+       return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+       vcpu->arch.cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+       vcpu->arch.xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.xer;
+}
+
+static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
+{
+       vcpu->arch.ctr = val;
+}
+
+static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.ctr;
+}
+
+static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
+{
+       vcpu->arch.lr = val;
+}
+
+static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.lr;
+}
+
+static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
+{
+       vcpu->arch.pc = val;
+}
+
+static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.pc;
+}
+
+static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
+{
+       ulong pc = kvmppc_get_pc(vcpu);
+
+       /* Load the instruction manually if it failed to do so in the
+        * exit path */
+       if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
+               kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
+
+       return vcpu->arch.last_inst;
+}
+
+static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.fault_dar;
+}
+
+static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
+{
+       return false;
+}
+#endif
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3                        0x113724FA
@@ -251,12 +389,4 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 
 #define INS_DCBZ                       0x7c0007ec
 
-/* Also add subarch specific defines */
-
-#ifdef CONFIG_PPC_BOOK3S_32
-#include <asm/kvm_book3s_32.h>
-#else
-#include <asm/kvm_book3s_64.h>
-#endif
-
 #endif /* __ASM_KVM_BOOK3S_H__ */
index 4cadd61..e43fe42 100644 (file)
 #ifndef __ASM_KVM_BOOK3S_64_H__
 #define __ASM_KVM_BOOK3S_64_H__
 
+#ifdef CONFIG_KVM_BOOK3S_PR
 static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
 {
        return &get_paca()->shadow_vcpu;
 }
+#endif
+
+#define SPAPR_TCE_SHIFT                12
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
index d5a8a38..ef7b368 100644 (file)
@@ -60,6 +60,36 @@ kvmppc_resume_\intno:
 
 #else  /*__ASSEMBLY__ */
 
+/*
+ * This struct goes in the PACA on 64-bit processors.  It is used
+ * to store host state that needs to be saved when we enter a guest
+ * and restored when we exit, but isn't specific to any particular
+ * guest or vcpu.  It also has some scratch fields used by the guest
+ * exit code.
+ */
+struct kvmppc_host_state {
+       ulong host_r1;
+       ulong host_r2;
+       ulong host_msr;
+       ulong vmhandler;
+       ulong scratch0;
+       ulong scratch1;
+       u8 in_guest;
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       struct kvm_vcpu *kvm_vcpu;
+       struct kvmppc_vcore *kvm_vcore;
+       unsigned long xics_phys;
+       u64 dabr;
+       u64 host_mmcr[3];
+       u32 host_pmc[8];
+       u64 host_purr;
+       u64 host_spurr;
+       u64 host_dscr;
+       u64 dec_expires;
+#endif
+};
+
 struct kvmppc_book3s_shadow_vcpu {
        ulong gpr[14];
        u32 cr;
@@ -73,17 +103,12 @@ struct kvmppc_book3s_shadow_vcpu {
        ulong shadow_srr1;
        ulong fault_dar;
 
-       ulong host_r1;
-       ulong host_r2;
-       ulong handler;
-       ulong scratch0;
-       ulong scratch1;
-       ulong vmhandler;
-       u8 in_guest;
-
 #ifdef CONFIG_PPC_BOOK3S_32
        u32     sr[16];                 /* Guest SRs */
+
+       struct kvmppc_host_state hstate;
 #endif
+
 #ifdef CONFIG_PPC_BOOK3S_64
        u8 slb_max;                     /* highest used guest slb entry */
        struct  {
index 9c9ba3d..a90e091 100644 (file)
@@ -93,4 +93,8 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
        return vcpu->arch.fault_dear;
 }
 
+static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.shared->msr;
+}
 #endif /* __ASM_KVM_BOOKE_H__ */
index 7a2a565..adbfca9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu, <yu.liu@freescale.com>
  *
@@ -29,17 +29,25 @@ struct tlbe{
        u32 mas7;
 };
 
+#define E500_TLB_VALID 1
+#define E500_TLB_DIRTY 2
+
+struct tlbe_priv {
+       pfn_t pfn;
+       unsigned int flags; /* E500_TLB_* */
+};
+
+struct vcpu_id_table;
+
 struct kvmppc_vcpu_e500 {
        /* Unmodified copy of the guest's TLB. */
-       struct tlbe *guest_tlb[E500_TLB_NUM];
-       /* TLB that's actually used when the guest is running. */
-       struct tlbe *shadow_tlb[E500_TLB_NUM];
-       /* Pages which are referenced in the shadow TLB. */
-       struct page **shadow_pages[E500_TLB_NUM];
+       struct tlbe *gtlb_arch[E500_TLB_NUM];
 
-       unsigned int guest_tlb_size[E500_TLB_NUM];
-       unsigned int shadow_tlb_size[E500_TLB_NUM];
-       unsigned int guest_tlb_nv[E500_TLB_NUM];
+       /* KVM internal information associated with each guest TLB entry */
+       struct tlbe_priv *gtlb_priv[E500_TLB_NUM];
+
+       unsigned int gtlb_size[E500_TLB_NUM];
+       unsigned int gtlb_nv[E500_TLB_NUM];
 
        u32 host_pid[E500_PID_NUM];
        u32 pid[E500_PID_NUM];
@@ -53,6 +61,10 @@ struct kvmppc_vcpu_e500 {
        u32 mas5;
        u32 mas6;
        u32 mas7;
+
+       /* vcpu id table */
+       struct vcpu_id_table *idt;
+
        u32 l1csr0;
        u32 l1csr1;
        u32 hid0;
index 186f150..cc22b28 100644 (file)
 #include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/kvm_types.h>
+#include <linux/threads.h>
+#include <linux/spinlock.h>
 #include <linux/kvm_para.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
 #include <asm/kvm_asm.h>
+#include <asm/processor.h>
 
-#define KVM_MAX_VCPUS 1
+#define KVM_MAX_VCPUS          NR_CPUS
+#define KVM_MAX_VCORES         NR_CPUS
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
 
+#ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+#endif
 
 /* We don't currently support large pages. */
 #define KVM_HPAGE_GFN_SHIFT(x) 0
@@ -57,6 +65,10 @@ struct kvm;
 struct kvm_run;
 struct kvm_vcpu;
 
+struct lppaca;
+struct slb_shadow;
+struct dtl;
+
 struct kvm_vm_stat {
        u32 remote_tlb_flush;
 };
@@ -133,9 +145,74 @@ struct kvmppc_exit_timing {
        };
 };
 
+struct kvmppc_pginfo {
+       unsigned long pfn;
+       atomic_t refcnt;
+};
+
+struct kvmppc_spapr_tce_table {
+       struct list_head list;
+       struct kvm *kvm;
+       u64 liobn;
+       u32 window_size;
+       struct page *pages[0];
+};
+
+struct kvmppc_rma_info {
+       void            *base_virt;
+       unsigned long    base_pfn;
+       unsigned long    npages;
+       struct list_head list;
+       atomic_t         use_count;
+};
+
 struct kvm_arch {
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       unsigned long hpt_virt;
+       unsigned long ram_npages;
+       unsigned long ram_psize;
+       unsigned long ram_porder;
+       struct kvmppc_pginfo *ram_pginfo;
+       unsigned int lpid;
+       unsigned int host_lpid;
+       unsigned long host_lpcr;
+       unsigned long sdr1;
+       unsigned long host_sdr1;
+       int tlbie_lock;
+       int n_rma_pages;
+       unsigned long lpcr;
+       unsigned long rmor;
+       struct kvmppc_rma_info *rma;
+       struct list_head spapr_tce_tables;
+       unsigned short last_vcpu[NR_CPUS];
+       struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
 };
 
+/*
+ * Struct for a virtual core.
+ * Note: entry_exit_count combines an entry count in the bottom 8 bits
+ * and an exit count in the next 8 bits.  This is so that we can
+ * atomically increment the entry count iff the exit count is 0
+ * without taking the lock.
+ */
+struct kvmppc_vcore {
+       int n_runnable;
+       int n_blocked;
+       int num_threads;
+       int entry_exit_count;
+       int n_woken;
+       int nap_count;
+       u16 pcpu;
+       u8 vcore_running;
+       u8 in_guest;
+       struct list_head runnable_threads;
+       spinlock_t lock;
+};
+
+#define VCORE_ENTRY_COUNT(vc)  ((vc)->entry_exit_count & 0xff)
+#define VCORE_EXIT_COUNT(vc)   ((vc)->entry_exit_count >> 8)
+
 struct kvmppc_pte {
        ulong eaddr;
        u64 vpage;
@@ -163,16 +240,18 @@ struct kvmppc_mmu {
        bool (*is_dcbz32)(struct kvm_vcpu *vcpu);
 };
 
-struct hpte_cache {
-       struct hlist_node list_pte;
-       struct hlist_node list_pte_long;
-       struct hlist_node list_vpte;
-       struct hlist_node list_vpte_long;
-       struct rcu_head rcu_head;
-       u64 host_va;
-       u64 pfn;
-       ulong slot;
-       struct kvmppc_pte pte;
+struct kvmppc_slb {
+       u64 esid;
+       u64 vsid;
+       u64 orige;
+       u64 origv;
+       bool valid      : 1;
+       bool Ks         : 1;
+       bool Kp         : 1;
+       bool nx         : 1;
+       bool large      : 1;    /* PTEs are 16MB */
+       bool tb         : 1;    /* 1TB segment */
+       bool class      : 1;
 };
 
 struct kvm_vcpu_arch {
@@ -187,6 +266,9 @@ struct kvm_vcpu_arch {
        ulong highmem_handler;
        ulong rmcall;
        ulong host_paca_phys;
+       struct kvmppc_slb slb[64];
+       int slb_max;            /* 1 + index of last valid entry in slb[] */
+       int slb_nr;             /* total number of entries in SLB */
        struct kvmppc_mmu mmu;
 #endif
 
@@ -195,13 +277,19 @@ struct kvm_vcpu_arch {
        u64 fpr[32];
        u64 fpscr;
 
+#ifdef CONFIG_SPE
+       ulong evr[32];
+       ulong spefscr;
+       ulong host_spefscr;
+       u64 acc;
+#endif
 #ifdef CONFIG_ALTIVEC
        vector128 vr[32];
        vector128 vscr;
 #endif
 
 #ifdef CONFIG_VSX
-       u64 vsr[32];
+       u64 vsr[64];
 #endif
 
 #ifdef CONFIG_PPC_BOOK3S
@@ -209,22 +297,27 @@ struct kvm_vcpu_arch {
        u32 qpr[32];
 #endif
 
-#ifdef CONFIG_BOOKE
        ulong pc;
        ulong ctr;
        ulong lr;
 
        ulong xer;
        u32 cr;
-#endif
 
 #ifdef CONFIG_PPC_BOOK3S
-       ulong shadow_msr;
        ulong hflags;
        ulong guest_owned_ext;
+       ulong purr;
+       ulong spurr;
+       ulong dscr;
+       ulong amr;
+       ulong uamor;
+       u32 ctrl;
+       ulong dabr;
 #endif
        u32 vrsave; /* also USPRG0 */
        u32 mmucr;
+       ulong shadow_msr;
        ulong sprg4;
        ulong sprg5;
        ulong sprg6;
@@ -249,6 +342,7 @@ struct kvm_vcpu_arch {
        u32 pvr;
 
        u32 shadow_pid;
+       u32 shadow_pid1;
        u32 pid;
        u32 swap_pid;
 
@@ -258,6 +352,9 @@ struct kvm_vcpu_arch {
        u32 dbcr1;
        u32 dbsr;
 
+       u64 mmcr[3];
+       u32 pmc[8];
+
 #ifdef CONFIG_KVM_EXIT_TIMING
        struct mutex exit_timing_lock;
        struct kvmppc_exit_timing timing_exit;
@@ -272,8 +369,12 @@ struct kvm_vcpu_arch {
        struct dentry *debugfs_exit_timing;
 #endif
 
+#ifdef CONFIG_PPC_BOOK3S
+       ulong fault_dar;
+       u32 fault_dsisr;
+#endif
+
 #ifdef CONFIG_BOOKE
-       u32 last_inst;
        ulong fault_dear;
        ulong fault_esr;
        ulong queued_dear;
@@ -288,25 +389,47 @@ struct kvm_vcpu_arch {
        u8 dcr_is_write;
        u8 osi_needed;
        u8 osi_enabled;
+       u8 hcall_needed;
 
        u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
        struct hrtimer dec_timer;
        struct tasklet_struct tasklet;
        u64 dec_jiffies;
+       u64 dec_expires;
        unsigned long pending_exceptions;
+       u16 last_cpu;
+       u8 ceded;
+       u8 prodded;
+       u32 last_inst;
+
+       struct lppaca *vpa;
+       struct slb_shadow *slb_shadow;
+       struct dtl *dtl;
+       struct dtl *dtl_end;
+
+       struct kvmppc_vcore *vcore;
+       int ret;
+       int trap;
+       int state;
+       int ptid;
+       wait_queue_head_t cpu_run;
+
        struct kvm_vcpu_arch_shared *shared;
        unsigned long magic_page_pa; /* phys addr to map the magic page to */
        unsigned long magic_page_ea; /* effect. addr to map the magic page to */
 
-#ifdef CONFIG_PPC_BOOK3S
-       struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
-       struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
-       struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
-       struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
-       int hpte_cache_count;
-       spinlock_t mmu_lock;
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       struct kvm_vcpu_arch_shared shregs;
+
+       struct list_head run_list;
+       struct task_struct *run_task;
+       struct kvm_run *kvm_run;
 #endif
 };
 
+#define KVMPPC_VCPU_BUSY_IN_HOST       0
+#define KVMPPC_VCPU_BLOCKED            1
+#define KVMPPC_VCPU_RUNNABLE           2
+
 #endif /* __POWERPC_KVM_HOST_H__ */
index 9345238..d121f49 100644 (file)
@@ -33,6 +33,9 @@
 #else
 #include <asm/kvm_booke.h>
 #endif
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/paca.h>
+#endif
 
 enum emulation_result {
        EMULATE_DONE,         /* no further processing */
@@ -42,6 +45,7 @@ enum emulation_result {
        EMULATE_AGAIN,        /* something went wrong. go again */
 };
 
+extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 extern char kvmppc_handlers_start[];
 extern unsigned long kvmppc_handler_len;
@@ -109,6 +113,27 @@ extern void kvmppc_booke_exit(void);
 
 extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
+extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
+
+extern long kvmppc_alloc_hpt(struct kvm *kvm);
+extern void kvmppc_free_hpt(struct kvm *kvm);
+extern long kvmppc_prepare_vrma(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem);
+extern void kvmppc_map_vrma(struct kvm *kvm,
+                           struct kvm_userspace_memory_region *mem);
+extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+                               struct kvm_create_spapr_tce *args);
+extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
+                               struct kvm_allocate_rma *rma);
+extern struct kvmppc_rma_info *kvm_alloc_rma(void);
+extern void kvm_release_rma(struct kvmppc_rma_info *ri);
+extern int kvmppc_core_init_vm(struct kvm *kvm);
+extern void kvmppc_core_destroy_vm(struct kvm *kvm);
+extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem);
+extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem);
 
 /*
  * Cuts out inst bits with ordering according to spec.
@@ -151,4 +176,20 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+{
+       paca[cpu].kvm_hstate.xics_phys = addr;
+}
+
+extern void kvm_rma_init(void);
+
+#else
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+{}
+
+static inline void kvm_rma_init(void)
+{}
+#endif
+
 #endif /* __POWERPC_KVM_PPC_H__ */
index d865bd9..b445e0a 100644 (file)
@@ -90,13 +90,19 @@ extern char initial_stab[];
 
 #define HPTE_R_PP0             ASM_CONST(0x8000000000000000)
 #define HPTE_R_TS              ASM_CONST(0x4000000000000000)
+#define HPTE_R_KEY_HI          ASM_CONST(0x3000000000000000)
 #define HPTE_R_RPN_SHIFT       12
-#define HPTE_R_RPN             ASM_CONST(0x3ffffffffffff000)
-#define HPTE_R_FLAGS           ASM_CONST(0x00000000000003ff)
+#define HPTE_R_RPN             ASM_CONST(0x0ffffffffffff000)
 #define HPTE_R_PP              ASM_CONST(0x0000000000000003)
 #define HPTE_R_N               ASM_CONST(0x0000000000000004)
+#define HPTE_R_G               ASM_CONST(0x0000000000000008)
+#define HPTE_R_M               ASM_CONST(0x0000000000000010)
+#define HPTE_R_I               ASM_CONST(0x0000000000000020)
+#define HPTE_R_W               ASM_CONST(0x0000000000000040)
+#define HPTE_R_WIMG            ASM_CONST(0x0000000000000078)
 #define HPTE_R_C               ASM_CONST(0x0000000000000080)
 #define HPTE_R_R               ASM_CONST(0x0000000000000100)
+#define HPTE_R_KEY_LO          ASM_CONST(0x0000000000000e00)
 
 #define HPTE_V_1TB_SEG         ASM_CONST(0x4000000000000000)
 #define HPTE_V_VRMA_MASK       ASM_CONST(0x4001ffffff000000)
index 7412676..a6da128 100644 (file)
@@ -147,9 +147,12 @@ struct paca_struct {
        struct dtl_entry *dtl_curr;     /* pointer corresponding to dtl_ridx */
 
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_PR
        /* We use this to store guest state in */
        struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
 #endif
+       struct kvmppc_host_state kvm_hstate;
+#endif
 };
 
 extern struct paca_struct *paca;
index 1b42238..368f72f 100644 (file)
@@ -150,18 +150,22 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 #define REST_16VSRSU(n,b,base) REST_8VSRSU(n,b,base); REST_8VSRSU(n+8,b,base)
 #define REST_32VSRSU(n,b,base) REST_16VSRSU(n,b,base); REST_16VSRSU(n+16,b,base)
 
-#define SAVE_EVR(n,s,base)     evmergehi s,s,n; stw s,THREAD_EVR0+4*(n)(base)
-#define SAVE_2EVRS(n,s,base)   SAVE_EVR(n,s,base); SAVE_EVR(n+1,s,base)
-#define SAVE_4EVRS(n,s,base)   SAVE_2EVRS(n,s,base); SAVE_2EVRS(n+2,s,base)
-#define SAVE_8EVRS(n,s,base)   SAVE_4EVRS(n,s,base); SAVE_4EVRS(n+4,s,base)
-#define SAVE_16EVRS(n,s,base)  SAVE_8EVRS(n,s,base); SAVE_8EVRS(n+8,s,base)
-#define SAVE_32EVRS(n,s,base)  SAVE_16EVRS(n,s,base); SAVE_16EVRS(n+16,s,base)
-#define REST_EVR(n,s,base)     lwz s,THREAD_EVR0+4*(n)(base); evmergelo n,s,n
-#define REST_2EVRS(n,s,base)   REST_EVR(n,s,base); REST_EVR(n+1,s,base)
-#define REST_4EVRS(n,s,base)   REST_2EVRS(n,s,base); REST_2EVRS(n+2,s,base)
-#define REST_8EVRS(n,s,base)   REST_4EVRS(n,s,base); REST_4EVRS(n+4,s,base)
-#define REST_16EVRS(n,s,base)  REST_8EVRS(n,s,base); REST_8EVRS(n+8,s,base)
-#define REST_32EVRS(n,s,base)  REST_16EVRS(n,s,base); REST_16EVRS(n+16,s,base)
+/*
+ * b = base register for addressing, o = base offset from register of 1st EVR
+ * n = first EVR, s = scratch
+ */
+#define SAVE_EVR(n,s,b,o)      evmergehi s,s,n; stw s,o+4*(n)(b)
+#define SAVE_2EVRS(n,s,b,o)    SAVE_EVR(n,s,b,o); SAVE_EVR(n+1,s,b,o)
+#define SAVE_4EVRS(n,s,b,o)    SAVE_2EVRS(n,s,b,o); SAVE_2EVRS(n+2,s,b,o)
+#define SAVE_8EVRS(n,s,b,o)    SAVE_4EVRS(n,s,b,o); SAVE_4EVRS(n+4,s,b,o)
+#define SAVE_16EVRS(n,s,b,o)   SAVE_8EVRS(n,s,b,o); SAVE_8EVRS(n+8,s,b,o)
+#define SAVE_32EVRS(n,s,b,o)   SAVE_16EVRS(n,s,b,o); SAVE_16EVRS(n+16,s,b,o)
+#define REST_EVR(n,s,b,o)      lwz s,o+4*(n)(b); evmergelo n,s,n
+#define REST_2EVRS(n,s,b,o)    REST_EVR(n,s,b,o); REST_EVR(n+1,s,b,o)
+#define REST_4EVRS(n,s,b,o)    REST_2EVRS(n,s,b,o); REST_2EVRS(n+2,s,b,o)
+#define REST_8EVRS(n,s,b,o)    REST_4EVRS(n,s,b,o); REST_4EVRS(n+4,s,b,o)
+#define REST_16EVRS(n,s,b,o)   REST_8EVRS(n,s,b,o); REST_8EVRS(n+8,s,b,o)
+#define REST_32EVRS(n,s,b,o)   REST_16EVRS(n,s,b,o); REST_16EVRS(n+16,s,b,o)
 
 /* Macros to adjust thread priority for hardware multithreading */
 #define HMT_VERY_LOW   or      31,31,31        # very low priority
index c5cae0d..ddbe57a 100644 (file)
 #define SPRN_CTR       0x009   /* Count Register */
 #define SPRN_DSCR      0x11
 #define SPRN_CFAR      0x1c    /* Come From Address Register */
+#define SPRN_AMR       0x1d    /* Authority Mask Register */
+#define SPRN_UAMOR     0x9d    /* User Authority Mask Override Register */
+#define SPRN_AMOR      0x15d   /* Authority Mask Override Register */
 #define SPRN_ACOP      0x1F    /* Available Coprocessor Register */
 #define SPRN_CTRLF     0x088
 #define SPRN_CTRLT     0x098
 #define   LPCR_VPM0    (1ul << (63-0))
 #define   LPCR_VPM1    (1ul << (63-1))
 #define   LPCR_ISL     (1ul << (63-2))
+#define   LPCR_VC_SH   (63-2)
 #define   LPCR_DPFD_SH (63-11)
 #define   LPCR_VRMA_L  (1ul << (63-12))
 #define   LPCR_VRMA_LP0        (1ul << (63-15))
 #define   LPCR_VRMA_LP1        (1ul << (63-16))
+#define   LPCR_VRMASD_SH (63-16)
 #define   LPCR_RMLS    0x1C000000      /* impl dependent rmo limit sel */
+#define          LPCR_RMLS_SH  (63-37)
 #define   LPCR_ILE     0x02000000      /* !HV irqs set MSR:LE */
 #define   LPCR_PECE    0x00007000      /* powersave exit cause enable */
 #define     LPCR_PECE0 0x00004000      /* ext. exceptions can cause exit */
 #define     LPCR_PECE1 0x00002000      /* decrementer can cause exit */
 #define     LPCR_PECE2 0x00001000      /* machine check etc can cause exit */
 #define   LPCR_MER     0x00000800      /* Mediated External Exception */
+#define   LPCR_LPES    0x0000000c
 #define   LPCR_LPES0   0x00000008      /* LPAR Env selector 0 */
 #define   LPCR_LPES1   0x00000004      /* LPAR Env selector 1 */
+#define   LPCR_LPES_SH 2
 #define   LPCR_RMI     0x00000002      /* real mode is cache inhibit */
 #define   LPCR_HDICE   0x00000001      /* Hyp Decr enable (HV,PR,EE) */
 #define SPRN_LPID      0x13F   /* Logical Partition Identifier */
+#define   LPID_RSVD    0x3ff           /* Reserved LPID for partn switching */
 #define        SPRN_HMER       0x150   /* Hardware m? error recovery */
 #define        SPRN_HMEER      0x151   /* Hardware m? enable error recovery */
 #define        SPRN_HEIR       0x153   /* Hypervisor Emulated Instruction Register */
 #define SPRN_HASH1     0x3D2           /* Primary Hash Address Register */
 #define SPRN_HASH2     0x3D3           /* Secondary Hash Address Resgister */
 #define SPRN_HID0      0x3F0           /* Hardware Implementation Register 0 */
+#define HID0_HDICE_SH  (63 - 23)       /* 970 HDEC interrupt enable */
 #define HID0_EMCP      (1<<31)         /* Enable Machine Check pin */
 #define HID0_EBA       (1<<29)         /* Enable Bus Address Parity */
 #define HID0_EBD       (1<<28)         /* Enable Bus Data Parity */
 #define SPRN_IABR2     0x3FA           /* 83xx */
 #define SPRN_IBCR      0x135           /* 83xx Insn Breakpoint Control Reg */
 #define SPRN_HID4      0x3F4           /* 970 HID4 */
+#define  HID4_LPES0     (1ul << (63-0)) /* LPAR env. sel. bit 0 */
+#define         HID4_RMLS2_SH   (63 - 2)       /* Real mode limit bottom 2 bits */
+#define         HID4_LPID5_SH   (63 - 6)       /* partition ID bottom 4 bits */
+#define         HID4_RMOR_SH    (63 - 22)      /* real mode offset (16 bits) */
+#define  HID4_LPES1     (1 << (63-57)) /* LPAR env. sel. bit 1 */
+#define  HID4_RMLS0_SH  (63 - 58)      /* Real mode limit top bit */
+#define         HID4_LPID1_SH   0              /* partition ID top 2 bits */
 #define SPRN_HID4_GEKKO        0x3F3           /* Gekko HID4 */
 #define SPRN_HID5      0x3F6           /* 970 HID5 */
 #define SPRN_HID6      0x3F9   /* BE HID 6 */
        mfspr   rX,SPRN_SPRG_PACA;                      \
        FTR_SECTION_ELSE_NESTED(66);                    \
        mfspr   rX,SPRN_SPRG_HPACA;                     \
-       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #define SET_PACA(rX)                                   \
        BEGIN_FTR_SECTION_NESTED(66);                   \
        mtspr   SPRN_SPRG_PACA,rX;                      \
        FTR_SECTION_ELSE_NESTED(66);                    \
        mtspr   SPRN_SPRG_HPACA,rX;                     \
-       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #define GET_SCRATCH0(rX)                               \
        BEGIN_FTR_SECTION_NESTED(66);                   \
        mfspr   rX,SPRN_SPRG_SCRATCH0;                  \
        FTR_SECTION_ELSE_NESTED(66);                    \
        mfspr   rX,SPRN_SPRG_HSCRATCH0;                 \
-       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #define SET_SCRATCH0(rX)                               \
        BEGIN_FTR_SECTION_NESTED(66);                   \
        mtspr   SPRN_SPRG_SCRATCH0,rX;                  \
        FTR_SECTION_ELSE_NESTED(66);                    \
        mtspr   SPRN_SPRG_HSCRATCH0,rX;                 \
-       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+       ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66)
 
 #else /* CONFIG_PPC_BOOK3S_64 */
 #define GET_SCRATCH0(rX)       mfspr   rX,SPRN_SPRG_SCRATCH0
index 0f0ad9f..9ec0b39 100644 (file)
 #define ESR_ILK                0x00100000      /* Instr. Cache Locking */
 #define ESR_PUO                0x00040000      /* Unimplemented Operation exception */
 #define ESR_BO         0x00020000      /* Byte Ordering */
+#define ESR_SPV                0x00000080      /* Signal Processing operation */
 
 /* Bit definitions related to the DBCR0. */
 #if defined(CONFIG_40x)
index 36e1c8a..54b935f 100644 (file)
@@ -128,6 +128,7 @@ int main(void)
        DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page));
        /* paca */
        DEFINE(PACA_SIZE, sizeof(struct paca_struct));
+       DEFINE(PACA_LOCK_TOKEN, offsetof(struct paca_struct, lock_token));
        DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index));
        DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start));
        DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack));
@@ -187,7 +188,9 @@ int main(void)
        DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1));
        DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int));
        DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
+       DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
        DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
+       DEFINE(LPPACA_YIELDCOUNT, offsetof(struct lppaca, yield_count));
        DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
 #endif /* CONFIG_PPC_STD_MMU_64 */
        DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp));
@@ -198,11 +201,6 @@ int main(void)
        DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time));
        DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time));
        DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-       DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
-       DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb));
-       DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max));
-#endif
 #endif /* CONFIG_PPC64 */
 
        /* RTAS */
@@ -397,67 +395,160 @@ int main(void)
        DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
        DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
        DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
+       DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
+       DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr));
+#ifdef CONFIG_ALTIVEC
+       DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr));
+       DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr));
+#endif
+#ifdef CONFIG_VSX
+       DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr));
+#endif
+       DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
+       DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
+       DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
+       DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
+       DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.shregs.msr));
+       DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.shregs.srr0));
+       DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.shregs.srr1));
+       DEFINE(VCPU_SPRG0, offsetof(struct kvm_vcpu, arch.shregs.sprg0));
+       DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.shregs.sprg1));
+       DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
+       DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
+#endif
        DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
        DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
        DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
        DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
        DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
+       DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1));
        DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
        DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
+       DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
 
        /* book3s */
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
+       DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
+       DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
+       DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
+       DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
+       DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
+       DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
+       DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+       DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
+       DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
+       DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
+       DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
+#endif
 #ifdef CONFIG_PPC_BOOK3S
+       DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
+       DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
        DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
        DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
-       DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
+       DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
+       DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
+       DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
+       DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
+       DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
+       DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
+       DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
        DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
        DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
        DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
        DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
        DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
+       DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
+       DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
+       DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions));
+       DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
+       DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
+       DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
+       DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
+       DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
+       DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
+       DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
+       DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
+       DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+       DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
+       DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
+       DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
+       DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
+       DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
+       DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
        DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
                           offsetof(struct kvmppc_vcpu_book3s, vcpu));
-       DEFINE(SVCPU_CR, offsetof(struct kvmppc_book3s_shadow_vcpu, cr));
-       DEFINE(SVCPU_XER, offsetof(struct kvmppc_book3s_shadow_vcpu, xer));
-       DEFINE(SVCPU_CTR, offsetof(struct kvmppc_book3s_shadow_vcpu, ctr));
-       DEFINE(SVCPU_LR, offsetof(struct kvmppc_book3s_shadow_vcpu, lr));
-       DEFINE(SVCPU_PC, offsetof(struct kvmppc_book3s_shadow_vcpu, pc));
-       DEFINE(SVCPU_R0, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[0]));
-       DEFINE(SVCPU_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[1]));
-       DEFINE(SVCPU_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[2]));
-       DEFINE(SVCPU_R3, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[3]));
-       DEFINE(SVCPU_R4, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[4]));
-       DEFINE(SVCPU_R5, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[5]));
-       DEFINE(SVCPU_R6, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[6]));
-       DEFINE(SVCPU_R7, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[7]));
-       DEFINE(SVCPU_R8, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[8]));
-       DEFINE(SVCPU_R9, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[9]));
-       DEFINE(SVCPU_R10, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[10]));
-       DEFINE(SVCPU_R11, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[11]));
-       DEFINE(SVCPU_R12, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[12]));
-       DEFINE(SVCPU_R13, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[13]));
-       DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
-       DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2));
-       DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                        vmhandler));
-       DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                       scratch0));
-       DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                       scratch1));
-       DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                       in_guest));
-       DEFINE(SVCPU_FAULT_DSISR, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                          fault_dsisr));
-       DEFINE(SVCPU_FAULT_DAR, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                        fault_dar));
-       DEFINE(SVCPU_LAST_INST, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                        last_inst));
-       DEFINE(SVCPU_SHADOW_SRR1, offsetof(struct kvmppc_book3s_shadow_vcpu,
-                                          shadow_srr1));
+       DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
+       DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
+       DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
+
+#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_KVM_BOOK3S_PR
+# define SVCPU_FIELD(x, f)     DEFINE(x, offsetof(struct paca_struct, shadow_vcpu.f))
+#else
+# define SVCPU_FIELD(x, f)
+#endif
+# define HSTATE_FIELD(x, f)    DEFINE(x, offsetof(struct paca_struct, kvm_hstate.f))
+#else  /* 32-bit */
+# define SVCPU_FIELD(x, f)     DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, f))
+# define HSTATE_FIELD(x, f)    DEFINE(x, offsetof(struct kvmppc_book3s_shadow_vcpu, hstate.f))
+#endif
+
+       SVCPU_FIELD(SVCPU_CR, cr);
+       SVCPU_FIELD(SVCPU_XER, xer);
+       SVCPU_FIELD(SVCPU_CTR, ctr);
+       SVCPU_FIELD(SVCPU_LR, lr);
+       SVCPU_FIELD(SVCPU_PC, pc);
+       SVCPU_FIELD(SVCPU_R0, gpr[0]);
+       SVCPU_FIELD(SVCPU_R1, gpr[1]);
+       SVCPU_FIELD(SVCPU_R2, gpr[2]);
+       SVCPU_FIELD(SVCPU_R3, gpr[3]);
+       SVCPU_FIELD(SVCPU_R4, gpr[4]);
+       SVCPU_FIELD(SVCPU_R5, gpr[5]);
+       SVCPU_FIELD(SVCPU_R6, gpr[6]);
+       SVCPU_FIELD(SVCPU_R7, gpr[7]);
+       SVCPU_FIELD(SVCPU_R8, gpr[8]);
+       SVCPU_FIELD(SVCPU_R9, gpr[9]);
+       SVCPU_FIELD(SVCPU_R10, gpr[10]);
+       SVCPU_FIELD(SVCPU_R11, gpr[11]);
+       SVCPU_FIELD(SVCPU_R12, gpr[12]);
+       SVCPU_FIELD(SVCPU_R13, gpr[13]);
+       SVCPU_FIELD(SVCPU_FAULT_DSISR, fault_dsisr);
+       SVCPU_FIELD(SVCPU_FAULT_DAR, fault_dar);
+       SVCPU_FIELD(SVCPU_LAST_INST, last_inst);
+       SVCPU_FIELD(SVCPU_SHADOW_SRR1, shadow_srr1);
 #ifdef CONFIG_PPC_BOOK3S_32
-       DEFINE(SVCPU_SR, offsetof(struct kvmppc_book3s_shadow_vcpu, sr));
+       SVCPU_FIELD(SVCPU_SR, sr);
 #endif
-#else
+#ifdef CONFIG_PPC64
+       SVCPU_FIELD(SVCPU_SLB, slb);
+       SVCPU_FIELD(SVCPU_SLB_MAX, slb_max);
+#endif
+
+       HSTATE_FIELD(HSTATE_HOST_R1, host_r1);
+       HSTATE_FIELD(HSTATE_HOST_R2, host_r2);
+       HSTATE_FIELD(HSTATE_HOST_MSR, host_msr);
+       HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler);
+       HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
+       HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
+       HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
+       HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
+       HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
+       HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
+       HSTATE_FIELD(HSTATE_PMC, host_pmc);
+       HSTATE_FIELD(HSTATE_PURR, host_purr);
+       HSTATE_FIELD(HSTATE_SPURR, host_spurr);
+       HSTATE_FIELD(HSTATE_DSCR, host_dscr);
+       HSTATE_FIELD(HSTATE_DABR, dabr);
+       HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
+#else /* CONFIG_PPC_BOOK3S */
        DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
        DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
        DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
@@ -467,7 +558,7 @@ int main(void)
        DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
        DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
 #endif /* CONFIG_PPC_BOOK3S */
-#endif
+#endif /* CONFIG_KVM */
 
 #ifdef CONFIG_KVM_GUEST
        DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared,
@@ -497,6 +588,13 @@ int main(void)
        DEFINE(TLBCAM_MAS7, offsetof(struct tlbcam, MAS7));
 #endif
 
+#if defined(CONFIG_KVM) && defined(CONFIG_SPE)
+       DEFINE(VCPU_EVR, offsetof(struct kvm_vcpu, arch.evr[0]));
+       DEFINE(VCPU_ACC, offsetof(struct kvm_vcpu, arch.acc));
+       DEFINE(VCPU_SPEFSCR, offsetof(struct kvm_vcpu, arch.spefscr));
+       DEFINE(VCPU_HOST_SPEFSCR, offsetof(struct kvm_vcpu, arch.host_spefscr));
+#endif
+
 #ifdef CONFIG_KVM_EXIT_TIMING
        DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu,
                                                arch.timing_exit.tv32.tbu));
index 4f9a93f..76797c5 100644 (file)
@@ -45,12 +45,12 @@ _GLOBAL(__restore_cpu_power7)
        blr
 
 __init_hvmode_206:
-       /* Disable CPU_FTR_HVMODE_206 and exit if MSR:HV is not set */
+       /* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */
        mfmsr   r3
        rldicl. r0,r3,4,63
        bnelr
        ld      r5,CPU_SPEC_FEATURES(r4)
-       LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE_206)
+       LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
        xor     r5,r5,r6
        std     r5,CPU_SPEC_FEATURES(r4)
        blr
@@ -61,19 +61,23 @@ __init_LPCR:
         *   LPES = 0b01 (HSRR0/1 used for 0x500)
         *   PECE = 0b111
         *   DPFD = 4
+        *   HDICE = 0
+        *   VC = 0b100 (VPM0=1, VPM1=0, ISL=0)
+        *   VRMASD = 0b10000 (L=1, LP=00)
         *
         * Other bits untouched for now
         */
        mfspr   r3,SPRN_LPCR
-       ori     r3,r3,(LPCR_LPES0|LPCR_LPES1)
-       xori    r3,r3, LPCR_LPES0
+       li      r5,1
+       rldimi  r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
        ori     r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
-       li      r5,7
-       sldi    r5,r5,LPCR_DPFD_SH
-       andc    r3,r3,r5
        li      r5,4
-       sldi    r5,r5,LPCR_DPFD_SH
-       or      r3,r3,r5
+       rldimi  r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3
+       clrrdi  r3,r3,1         /* clear HDICE */
+       li      r5,4
+       rldimi  r3,r5, LPCR_VC_SH, 0
+       li      r5,0x10
+       rldimi  r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
        mtspr   SPRN_LPCR,r3
        isync
        blr
index 27f2507..12fac8d 100644 (file)
@@ -76,7 +76,7 @@ _GLOBAL(__setup_cpu_ppc970)
        /* Do nothing if not running in HV mode */
        mfmsr   r0
        rldicl. r0,r0,4,63
-       beqlr
+       beq     no_hv_mode
 
        mfspr   r0,SPRN_HID0
        li      r11,5                   /* clear DOZE and SLEEP */
@@ -90,7 +90,7 @@ _GLOBAL(__setup_cpu_ppc970MP)
        /* Do nothing if not running in HV mode */
        mfmsr   r0
        rldicl. r0,r0,4,63
-       beqlr
+       beq     no_hv_mode
 
        mfspr   r0,SPRN_HID0
        li      r11,0x15                /* clear DOZE and SLEEP */
@@ -109,6 +109,14 @@ load_hids:
        sync
        isync
 
+       /* Try to set LPES = 01 in HID4 */
+       mfspr   r0,SPRN_HID4
+       clrldi  r0,r0,1                 /* clear LPES0 */
+       ori     r0,r0,HID4_LPES1        /* set LPES1 */
+       sync
+       mtspr   SPRN_HID4,r0
+       isync
+
        /* Save away cpu state */
        LOAD_REG_ADDR(r5,cpu_state_storage)
 
@@ -117,11 +125,21 @@ load_hids:
        std     r3,CS_HID0(r5)
        mfspr   r3,SPRN_HID1
        std     r3,CS_HID1(r5)
-       mfspr   r3,SPRN_HID4
-       std     r3,CS_HID4(r5)
+       mfspr   r4,SPRN_HID4
+       std     r4,CS_HID4(r5)
        mfspr   r3,SPRN_HID5
        std     r3,CS_HID5(r5)
 
+       /* See if we successfully set LPES1 to 1; if not we are in Apple mode */
+       andi.   r4,r4,HID4_LPES1
+       bnelr
+
+no_hv_mode:
+       /* Disable CPU_FTR_HVMODE and exit, since we don't have HV mode */
+       ld      r5,CPU_SPEC_FEATURES(r4)
+       LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
+       andc    r5,r5,r6
+       std     r5,CPU_SPEC_FEATURES(r4)
        blr
 
 /* Called with no MMU context (typically MSR:IR/DR off) to
index a85f487..41b02c7 100644 (file)
@@ -40,7 +40,6 @@ __start_interrupts:
        .globl system_reset_pSeries;
 system_reset_pSeries:
        HMT_MEDIUM;
-       DO_KVM  0x100;
        SET_SCRATCH0(r13)
 #ifdef CONFIG_PPC_P7_NAP
 BEGIN_FTR_SECTION
@@ -50,82 +49,73 @@ BEGIN_FTR_SECTION
         * state loss at this time.
         */
        mfspr   r13,SPRN_SRR1
-       rlwinm  r13,r13,47-31,30,31
-       cmpwi   cr0,r13,1
-       bne     1f
-       b       .power7_wakeup_noloss
-1:     cmpwi   cr0,r13,2
-       bne     1f
-       b       .power7_wakeup_loss
+       rlwinm. r13,r13,47-31,30,31
+       beq     9f
+
+       /* waking up from powersave (nap) state */
+       cmpwi   cr1,r13,2
        /* Total loss of HV state is fatal, we could try to use the
         * PIR to locate a PACA, then use an emergency stack etc...
         * but for now, let's just stay stuck here
         */
-1:     cmpwi   cr0,r13,3
-       beq     .
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
+       bgt     cr1,.
+       GET_PACA(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       lbz     r0,PACAPROCSTART(r13)
+       cmpwi   r0,0x80
+       bne     1f
+       li      r0,0
+       stb     r0,PACAPROCSTART(r13)
+       b       kvm_start_guest
+1:
+#endif
+
+       beq     cr1,2f
+       b       .power7_wakeup_noloss
+2:     b       .power7_wakeup_loss
+9:
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif /* CONFIG_PPC_P7_NAP */
-       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD)
+       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
+                                NOTEST, 0x100)
 
        . = 0x200
-_machine_check_pSeries:
-       HMT_MEDIUM
-       DO_KVM  0x200
-       SET_SCRATCH0(r13)
-       EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, EXC_STD)
+machine_check_pSeries_1:
+       /* This is moved out of line as it can be patched by FW, but
+        * some code path might still want to branch into the original
+        * vector
+        */
+       b       machine_check_pSeries
 
        . = 0x300
        .globl data_access_pSeries
 data_access_pSeries:
        HMT_MEDIUM
-       DO_KVM  0x300
        SET_SCRATCH0(r13)
+#ifndef CONFIG_POWER4_ONLY
 BEGIN_FTR_SECTION
-       GET_PACA(r13)
-       std     r9,PACA_EXSLB+EX_R9(r13)
-       std     r10,PACA_EXSLB+EX_R10(r13)
-       mfspr   r10,SPRN_DAR
-       mfspr   r9,SPRN_DSISR
-       srdi    r10,r10,60
-       rlwimi  r10,r9,16,0x20
-       mfcr    r9
-       cmpwi   r10,0x2c
-       beq     do_stab_bolted_pSeries
-       ld      r10,PACA_EXSLB+EX_R10(r13)
-       std     r11,PACA_EXGEN+EX_R11(r13)
-       ld      r11,PACA_EXSLB+EX_R9(r13)
-       std     r12,PACA_EXGEN+EX_R12(r13)
-       GET_SCRATCH0(r12)
-       std     r10,PACA_EXGEN+EX_R10(r13)
-       std     r11,PACA_EXGEN+EX_R9(r13)
-       std     r12,PACA_EXGEN+EX_R13(r13)
-       EXCEPTION_PROLOG_PSERIES_1(data_access_common, EXC_STD)
-FTR_SECTION_ELSE
-       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD)
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_SLB)
+       b       data_access_check_stab
+data_access_not_stab:
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
+#endif
+       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
+                                KVMTEST_PR, 0x300)
 
        . = 0x380
        .globl data_access_slb_pSeries
 data_access_slb_pSeries:
        HMT_MEDIUM
-       DO_KVM  0x380
        SET_SCRATCH0(r13)
-       GET_PACA(r13)
+       EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
        std     r3,PACA_EXSLB+EX_R3(r13)
        mfspr   r3,SPRN_DAR
-       std     r9,PACA_EXSLB+EX_R9(r13)        /* save r9 - r12 */
-       mfcr    r9
 #ifdef __DISABLED__
        /* Keep that around for when we re-implement dynamic VSIDs */
        cmpdi   r3,0
        bge     slb_miss_user_pseries
 #endif /* __DISABLED__ */
-       std     r10,PACA_EXSLB+EX_R10(r13)
-       std     r11,PACA_EXSLB+EX_R11(r13)
-       std     r12,PACA_EXSLB+EX_R12(r13)
-       GET_SCRATCH0(r10)
-       std     r10,PACA_EXSLB+EX_R13(r13)
-       mfspr   r12,SPRN_SRR1           /* and SRR1 */
+       mfspr   r12,SPRN_SRR1
 #ifndef CONFIG_RELOCATABLE
        b       .slb_miss_realmode
 #else
@@ -147,24 +137,16 @@ data_access_slb_pSeries:
        .globl instruction_access_slb_pSeries
 instruction_access_slb_pSeries:
        HMT_MEDIUM
-       DO_KVM  0x480
        SET_SCRATCH0(r13)
-       GET_PACA(r13)
+       EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
        std     r3,PACA_EXSLB+EX_R3(r13)
        mfspr   r3,SPRN_SRR0            /* SRR0 is faulting address */
-       std     r9,PACA_EXSLB+EX_R9(r13)        /* save r9 - r12 */
-       mfcr    r9
 #ifdef __DISABLED__
        /* Keep that around for when we re-implement dynamic VSIDs */
        cmpdi   r3,0
        bge     slb_miss_user_pseries
 #endif /* __DISABLED__ */
-       std     r10,PACA_EXSLB+EX_R10(r13)
-       std     r11,PACA_EXSLB+EX_R11(r13)
-       std     r12,PACA_EXSLB+EX_R12(r13)
-       GET_SCRATCH0(r10)
-       std     r10,PACA_EXSLB+EX_R13(r13)
-       mfspr   r12,SPRN_SRR1           /* and SRR1 */
+       mfspr   r12,SPRN_SRR1
 #ifndef CONFIG_RELOCATABLE
        b       .slb_miss_realmode
 #else
@@ -184,26 +166,46 @@ instruction_access_slb_pSeries:
 hardware_interrupt_pSeries:
 hardware_interrupt_hv:
        BEGIN_FTR_SECTION
-               _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt, EXC_STD)
+               _MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
+                                           EXC_HV, SOFTEN_TEST_HV)
+               KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
        FTR_SECTION_ELSE
-               _MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt, EXC_HV)
-       ALT_FTR_SECTION_END_IFCLR(CPU_FTR_HVMODE_206)
+               _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
+                                           EXC_STD, SOFTEN_TEST_HV_201)
+               KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
+       ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 
        STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x600)
+
        STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x700)
+
        STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x800)
 
        MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
-       MASKABLE_EXCEPTION_HV(0x980, 0x980, decrementer)
+       MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer)
 
        STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xa00)
+
        STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xb00)
 
        . = 0xc00
        .globl  system_call_pSeries
 system_call_pSeries:
        HMT_MEDIUM
-       DO_KVM  0xc00
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+       SET_SCRATCH0(r13)
+       GET_PACA(r13)
+       std     r9,PACA_EXGEN+EX_R9(r13)
+       std     r10,PACA_EXGEN+EX_R10(r13)
+       mfcr    r9
+       KVMTEST(0xc00)
+       GET_SCRATCH0(r13)
+#endif
 BEGIN_FTR_SECTION
        cmpdi   r0,0x1ebe
        beq-    1f
@@ -220,6 +222,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
        rfid
        b       .       /* prevent speculative execution */
 
+       KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
+
 /* Fast LE/BE switch system call */
 1:     mfspr   r12,SPRN_SRR1
        xori    r12,r12,MSR_LE
@@ -228,6 +232,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
        b       .
 
        STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xd00)
 
        /* At 0xe??? we have a bunch of hypervisor exceptions, we branch
         * out of line to handle them
@@ -262,30 +267,93 @@ vsx_unavailable_pSeries_1:
 
 #ifdef CONFIG_CBE_RAS
        STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
+       KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
 #endif /* CONFIG_CBE_RAS */
+
        STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
+       KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
+
 #ifdef CONFIG_CBE_RAS
        STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
+       KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
 #endif /* CONFIG_CBE_RAS */
+
        STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x1700)
+
 #ifdef CONFIG_CBE_RAS
        STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
+       KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
 #endif /* CONFIG_CBE_RAS */
 
        . = 0x3000
 
 /*** Out of line interrupts support ***/
 
+       /* moved from 0x200 */
+machine_check_pSeries:
+       .globl machine_check_fwnmi
+machine_check_fwnmi:
+       HMT_MEDIUM
+       SET_SCRATCH0(r13)               /* save r13 */
+       EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common,
+                                EXC_STD, KVMTEST, 0x200)
+       KVM_HANDLER_SKIP(PACA_EXMC, EXC_STD, 0x200)
+
+#ifndef CONFIG_POWER4_ONLY
+       /* moved from 0x300 */
+data_access_check_stab:
+       GET_PACA(r13)
+       std     r9,PACA_EXSLB+EX_R9(r13)
+       std     r10,PACA_EXSLB+EX_R10(r13)
+       mfspr   r10,SPRN_DAR
+       mfspr   r9,SPRN_DSISR
+       srdi    r10,r10,60
+       rlwimi  r10,r9,16,0x20
+#ifdef CONFIG_KVM_BOOK3S_PR
+       lbz     r9,HSTATE_IN_GUEST(r13)
+       rlwimi  r10,r9,8,0x300
+#endif
+       mfcr    r9
+       cmpwi   r10,0x2c
+       beq     do_stab_bolted_pSeries
+       mtcrf   0x80,r9
+       ld      r9,PACA_EXSLB+EX_R9(r13)
+       ld      r10,PACA_EXSLB+EX_R10(r13)
+       b       data_access_not_stab
+do_stab_bolted_pSeries:
+       std     r11,PACA_EXSLB+EX_R11(r13)
+       std     r12,PACA_EXSLB+EX_R12(r13)
+       GET_SCRATCH0(r10)
+       std     r10,PACA_EXSLB+EX_R13(r13)
+       EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
+#endif /* CONFIG_POWER4_ONLY */
+
+       KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+       KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400)
+       KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900)
+       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
+
+       .align  7
        /* moved from 0xe00 */
-       STD_EXCEPTION_HV(., 0xe00, h_data_storage)
-       STD_EXCEPTION_HV(., 0xe20, h_instr_storage)
-       STD_EXCEPTION_HV(., 0xe40, emulation_assist)
-       STD_EXCEPTION_HV(., 0xe60, hmi_exception) /* need to flush cache ? */
+       STD_EXCEPTION_HV(., 0xe02, h_data_storage)
+       KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0xe02)
+       STD_EXCEPTION_HV(., 0xe22, h_instr_storage)
+       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe22)
+       STD_EXCEPTION_HV(., 0xe42, emulation_assist)
+       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe42)
+       STD_EXCEPTION_HV(., 0xe62, hmi_exception) /* need to flush cache ? */
+       KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62)
 
        /* moved from 0xf00 */
        STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf00)
        STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20)
        STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable)
+       KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
 
 /*
  * An interrupt came in while soft-disabled; clear EE in SRR1,
@@ -317,14 +385,6 @@ masked_Hinterrupt:
        hrfid
        b       .
 
-       .align  7
-do_stab_bolted_pSeries:
-       std     r11,PACA_EXSLB+EX_R11(r13)
-       std     r12,PACA_EXSLB+EX_R12(r13)
-       GET_SCRATCH0(r10)
-       std     r10,PACA_EXSLB+EX_R13(r13)
-       EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
-
 #ifdef CONFIG_PPC_PSERIES
 /*
  * Vectors for the FWNMI option.  Share common code.
@@ -334,14 +394,8 @@ do_stab_bolted_pSeries:
 system_reset_fwnmi:
        HMT_MEDIUM
        SET_SCRATCH0(r13)               /* save r13 */
-       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD)
-
-       .globl machine_check_fwnmi
-      .align 7
-machine_check_fwnmi:
-       HMT_MEDIUM
-       SET_SCRATCH0(r13)               /* save r13 */
-       EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, EXC_STD)
+       EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
+                                NOTEST, 0x100)
 
 #endif /* CONFIG_PPC_PSERIES */
 
@@ -376,7 +430,11 @@ slb_miss_user_pseries:
 /* KVM's trampoline code needs to be close to the interrupt handlers */
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_PR
 #include "../kvm/book3s_rmhandlers.S"
+#else
+#include "../kvm/book3s_hv_rmhandlers.S"
+#endif
 #endif
 
        .align  7
index 5ecf54c..fe37dd0 100644 (file)
@@ -656,7 +656,7 @@ load_up_spe:
        cmpi    0,r4,0
        beq     1f
        addi    r4,r4,THREAD    /* want THREAD of last_task_used_spe */
-       SAVE_32EVRS(0,r10,r4)
+       SAVE_32EVRS(0,r10,r4,THREAD_EVR0)
        evxor   evr10, evr10, evr10     /* clear out evr10 */
        evmwumiaa evr10, evr10, evr10   /* evr10 <- ACC = 0 * 0 + ACC */
        li      r5,THREAD_ACC
@@ -676,7 +676,7 @@ load_up_spe:
        stw     r4,THREAD_USED_SPE(r5)
        evlddx  evr4,r10,r5
        evmra   evr4,evr4
-       REST_32EVRS(0,r10,r5)
+       REST_32EVRS(0,r10,r5,THREAD_EVR0)
 #ifndef CONFIG_SMP
        subi    r4,r5,THREAD
        stw     r4,last_task_used_spe@l(r3)
@@ -787,13 +787,11 @@ _GLOBAL(giveup_spe)
        addi    r3,r3,THREAD            /* want THREAD of task */
        lwz     r5,PT_REGS(r3)
        cmpi    0,r5,0
-       SAVE_32EVRS(0, r4, r3)
+       SAVE_32EVRS(0, r4, r3, THREAD_EVR0)
        evxor   evr6, evr6, evr6        /* clear out evr6 */
        evmwumiaa evr6, evr6, evr6      /* evr6 <- ACC = 0 * 0 + ACC */
        li      r4,THREAD_ACC
        evstddx evr6, r4, r3            /* save off accumulator */
-       mfspr   r6,SPRN_SPEFSCR
-       stw     r6,THREAD_SPEFSCR(r3)   /* save spefscr register value */
        beq     1f
        lwz     r4,_MSR-STACK_FRAME_OVERHEAD(r5)
        lis     r3,MSR_SPE@h
index f8f0bc7..3a70845 100644 (file)
@@ -73,7 +73,6 @@ _GLOBAL(power7_idle)
        b       .
 
 _GLOBAL(power7_wakeup_loss)
-       GET_PACA(r13)
        ld      r1,PACAR1(r13)
        REST_NVGPRS(r1)
        REST_GPR(2, r1)
@@ -87,7 +86,6 @@ _GLOBAL(power7_wakeup_loss)
        rfid
 
 _GLOBAL(power7_wakeup_noloss)
-       GET_PACA(r13)
        ld      r1,PACAR1(r13)
        ld      r4,_MSR(r1)
        ld      r5,_NIP(r1)
index efeb881..0a5a899 100644 (file)
@@ -167,7 +167,7 @@ void setup_paca(struct paca_struct *new_paca)
         * if we do a GET_PACA() before the feature fixups have been
         * applied
         */
-       if (cpu_has_feature(CPU_FTR_HVMODE_206))
+       if (cpu_has_feature(CPU_FTR_HVMODE))
                mtspr(SPRN_SPRG_HPACA, local_paca);
 #endif
        mtspr(SPRN_SPRG_PACA, local_paca);
index 91e52df..ec2d0ed 100644 (file)
@@ -96,6 +96,7 @@ void flush_fp_to_thread(struct task_struct *tsk)
                preempt_enable();
        }
 }
+EXPORT_SYMBOL_GPL(flush_fp_to_thread);
 
 void enable_kernel_fp(void)
 {
@@ -145,6 +146,7 @@ void flush_altivec_to_thread(struct task_struct *tsk)
                preempt_enable();
        }
 }
+EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
 #endif /* CONFIG_ALTIVEC */
 
 #ifdef CONFIG_VSX
@@ -186,6 +188,7 @@ void flush_vsx_to_thread(struct task_struct *tsk)
                preempt_enable();
        }
 }
+EXPORT_SYMBOL_GPL(flush_vsx_to_thread);
 #endif /* CONFIG_VSX */
 
 #ifdef CONFIG_SPE
@@ -213,6 +216,7 @@ void flush_spe_to_thread(struct task_struct *tsk)
 #ifdef CONFIG_SMP
                        BUG_ON(tsk != current);
 #endif
+                       tsk->thread.spefscr = mfspr(SPRN_SPEFSCR);
                        giveup_spe(tsk);
                }
                preempt_enable();
index 79fca26..22051ef 100644 (file)
@@ -375,6 +375,9 @@ void __init check_for_initrd(void)
 
 int threads_per_core, threads_shift;
 cpumask_t threads_core_mask;
+EXPORT_SYMBOL_GPL(threads_per_core);
+EXPORT_SYMBOL_GPL(threads_shift);
+EXPORT_SYMBOL_GPL(threads_core_mask);
 
 static void __init cpu_init_thread_core_maps(int tpc)
 {
index a88bf27..532054f 100644 (file)
@@ -63,6 +63,7 @@
 #include <asm/kexec.h>
 #include <asm/mmu_context.h>
 #include <asm/code-patching.h>
+#include <asm/kvm_ppc.h>
 
 #include "setup.h"
 
@@ -580,6 +581,8 @@ void __init setup_arch(char **cmdline_p)
        /* Initialize the MMU context management stuff */
        mmu_context_init();
 
+       kvm_rma_init();
+
        ppc64_boot_msg(0x15, "Setup Done");
 }
 
index 8ebc670..09a85a9 100644 (file)
@@ -243,6 +243,7 @@ void smp_send_reschedule(int cpu)
        if (likely(smp_ops))
                smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE);
 }
+EXPORT_SYMBOL_GPL(smp_send_reschedule);
 
 void arch_send_call_function_single_ipi(int cpu)
 {
index 1a01414..f19d977 100644 (file)
@@ -1387,10 +1387,7 @@ void SPEFloatingPointException(struct pt_regs *regs)
        int code = 0;
        int err;
 
-       preempt_disable();
-       if (regs->msr & MSR_SPE)
-               giveup_spe(current);
-       preempt_enable();
+       flush_spe_to_thread(current);
 
        spefscr = current->thread.spefscr;
        fpexc_mode = current->thread.fpexc_mode;
index 5f3cff8..33aa715 100644 (file)
@@ -387,8 +387,10 @@ static void kvmppc_44x_invalidate(struct kvm_vcpu *vcpu,
        }
 }
 
-void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
 {
+       int usermode = vcpu->arch.shared->msr & MSR_PR;
+
        vcpu->arch.shadow_pid = !usermode;
 }
 
index 105b691..78133de 100644 (file)
@@ -20,7 +20,6 @@ config KVM
        bool
        select PREEMPT_NOTIFIERS
        select ANON_INODES
-       select KVM_MMIO
 
 config KVM_BOOK3S_HANDLER
        bool
@@ -28,16 +27,22 @@ config KVM_BOOK3S_HANDLER
 config KVM_BOOK3S_32_HANDLER
        bool
        select KVM_BOOK3S_HANDLER
+       select KVM_MMIO
 
 config KVM_BOOK3S_64_HANDLER
        bool
        select KVM_BOOK3S_HANDLER
 
+config KVM_BOOK3S_PR
+       bool
+       select KVM_MMIO
+
 config KVM_BOOK3S_32
        tristate "KVM support for PowerPC book3s_32 processors"
        depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
        select KVM
        select KVM_BOOK3S_32_HANDLER
+       select KVM_BOOK3S_PR
        ---help---
          Support running unmodified book3s_32 guest kernels
          in virtual machines on book3s_32 host processors.
@@ -50,8 +55,8 @@ config KVM_BOOK3S_32
 config KVM_BOOK3S_64
        tristate "KVM support for PowerPC book3s_64 processors"
        depends on EXPERIMENTAL && PPC_BOOK3S_64
-       select KVM
        select KVM_BOOK3S_64_HANDLER
+       select KVM
        ---help---
          Support running unmodified book3s_64 and book3s_32 guest kernels
          in virtual machines on book3s_64 host processors.
@@ -61,10 +66,34 @@ config KVM_BOOK3S_64
 
          If unsure, say N.
 
+config KVM_BOOK3S_64_HV
+       bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
+       depends on KVM_BOOK3S_64
+       ---help---
+         Support running unmodified book3s_64 guest kernels in
+         virtual machines on POWER7 and PPC970 processors that have
+         hypervisor mode available to the host.
+
+         If you say Y here, KVM will use the hardware virtualization
+         facilities of POWER7 (and later) processors, meaning that
+         guest operating systems will run at full hardware speed
+         using supervisor and user modes.  However, this also means
+         that KVM is not usable under PowerVM (pHyp), is only usable
+         on POWER7 (or later) processors and PPC970-family processors,
+         and cannot emulate a different processor from the host processor.
+
+         If unsure, say N.
+
+config KVM_BOOK3S_64_PR
+       def_bool y
+       depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV
+       select KVM_BOOK3S_PR
+
 config KVM_440
        bool "KVM support for PowerPC 440 processors"
        depends on EXPERIMENTAL && 44x
        select KVM
+       select KVM_MMIO
        ---help---
          Support running unmodified 440 guest kernels in virtual machines on
          440 host processors.
@@ -89,6 +118,7 @@ config KVM_E500
        bool "KVM support for PowerPC E500 processors"
        depends on EXPERIMENTAL && E500
        select KVM
+       select KVM_MMIO
        ---help---
          Support running unmodified E500 guest kernels in virtual machines on
          E500 host processors.
index 4d68638..08428e2 100644 (file)
@@ -38,24 +38,42 @@ kvm-e500-objs := \
        e500_emulate.o
 kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
 
-kvm-book3s_64-objs := \
-       $(common-objs-y) \
+kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
+       ../../../virt/kvm/coalesced_mmio.o \
        fpu.o \
        book3s_paired_singles.o \
-       book3s.o \
+       book3s_pr.o \
        book3s_emulate.o \
        book3s_interrupts.o \
        book3s_mmu_hpte.o \
        book3s_64_mmu_host.o \
        book3s_64_mmu.o \
        book3s_32_mmu.o
-kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs)
+
+kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
+       book3s_hv.o \
+       book3s_hv_interrupts.o \
+       book3s_64_mmu_hv.o
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
+       book3s_hv_rm_mmu.o \
+       book3s_64_vio_hv.o \
+       book3s_hv_builtin.o
+
+kvm-book3s_64-module-objs := \
+       ../../../virt/kvm/kvm_main.o \
+       powerpc.o \
+       emulate.o \
+       book3s.o \
+       $(kvm-book3s_64-objs-y)
+
+kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
 
 kvm-book3s_32-objs := \
        $(common-objs-y) \
        fpu.o \
        book3s_paired_singles.o \
        book3s.o \
+       book3s_pr.o \
        book3s_emulate.o \
        book3s_interrupts.o \
        book3s_mmu_hpte.o \
@@ -70,3 +88,4 @@ obj-$(CONFIG_KVM_E500) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o
 
+obj-y += $(kvm-book3s_64-builtin-objs-y)
index 0f95b5c..f68a34d 100644 (file)
@@ -17,7 +17,6 @@
 #include <linux/kvm_host.h>
 #include <linux/err.h>
 #include <linux/slab.h>
-#include "trace.h"
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 #include <asm/mmu_context.h>
+#include <asm/page.h>
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 
+#include "trace.h"
+
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 /* #define EXIT_DEBUG */
-/* #define DEBUG_EXT */
-
-static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
-                            ulong msr);
-
-/* Some compatibility defines */
-#ifdef CONFIG_PPC_BOOK3S_32
-#define MSR_USER32 MSR_USER
-#define MSR_USER64 MSR_USER
-#define HW_PAGE_SIZE PAGE_SIZE
-#endif
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "exits",       VCPU_STAT(sum_exits) },
@@ -77,100 +68,11 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 {
 }
 
-void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-       memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb));
-       memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
-              sizeof(get_paca()->shadow_vcpu));
-       to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max;
-#endif
-
-#ifdef CONFIG_PPC_BOOK3S_32
-       current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
-#endif
-}
-
-void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-       memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb));
-       memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
-              sizeof(get_paca()->shadow_vcpu));
-       to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max;
-#endif
-
-       kvmppc_giveup_ext(vcpu, MSR_FP);
-       kvmppc_giveup_ext(vcpu, MSR_VEC);
-       kvmppc_giveup_ext(vcpu, MSR_VSX);
-}
-
-static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
-{
-       ulong smsr = vcpu->arch.shared->msr;
-
-       /* Guest MSR values */
-       smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
-       /* Process MSR values */
-       smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
-       /* External providers the guest reserved */
-       smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext);
-       /* 64-bit Process MSR values */
-#ifdef CONFIG_PPC_BOOK3S_64
-       smsr |= MSR_ISF | MSR_HV;
-#endif
-       vcpu->arch.shadow_msr = smsr;
-}
-
-void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
-{
-       ulong old_msr = vcpu->arch.shared->msr;
-
-#ifdef EXIT_DEBUG
-       printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
-#endif
-
-       msr &= to_book3s(vcpu)->msr_mask;
-       vcpu->arch.shared->msr = msr;
-       kvmppc_recalc_shadow_msr(vcpu);
-
-       if (msr & MSR_POW) {
-               if (!vcpu->arch.pending_exceptions) {
-                       kvm_vcpu_block(vcpu);
-                       vcpu->stat.halt_wakeup++;
-
-                       /* Unset POW bit after we woke up */
-                       msr &= ~MSR_POW;
-                       vcpu->arch.shared->msr = msr;
-               }
-       }
-
-       if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) !=
-                  (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
-               kvmppc_mmu_flush_segments(vcpu);
-               kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
-
-               /* Preload magic page segment when in kernel mode */
-               if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
-                       struct kvm_vcpu_arch *a = &vcpu->arch;
-
-                       if (msr & MSR_DR)
-                               kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
-                       else
-                               kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
-               }
-       }
-
-       /* Preload FPU if it's enabled */
-       if (vcpu->arch.shared->msr & MSR_FP)
-               kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
-}
-
 void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
 {
        vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu);
        vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags;
-       kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec);
+       kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec);
        vcpu->arch.mmu.reset_msr(vcpu);
 }
 
@@ -204,11 +106,13 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
 static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
                                          unsigned int vec)
 {
+       unsigned long old_pending = vcpu->arch.pending_exceptions;
+
        clear_bit(kvmppc_book3s_vec2irqprio(vec),
                  &vcpu->arch.pending_exceptions);
 
-       if (!vcpu->arch.pending_exceptions)
-               vcpu->arch.shared->int_pending = 0;
+       kvmppc_update_int_pending(vcpu, vcpu->arch.pending_exceptions,
+                                 old_pending);
 }
 
 void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
@@ -225,8 +129,8 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
 
 void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
 {
-       to_book3s(vcpu)->prog_flags = flags;
-       kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM);
+       /* might as well deliver this straight away */
+       kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags);
 }
 
 void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
@@ -266,21 +170,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 {
        int deliver = 1;
        int vec = 0;
-       ulong flags = 0ULL;
-       ulong crit_raw = vcpu->arch.shared->critical;
-       ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
-       bool crit;
-
-       /* Truncate crit indicators in 32 bit mode */
-       if (!(vcpu->arch.shared->msr & MSR_SF)) {
-               crit_raw &= 0xffffffff;
-               crit_r1 &= 0xffffffff;
-       }
-
-       /* Critical section when crit == r1 */
-       crit = (crit_raw == crit_r1);
-       /* ... and we're in supervisor mode */
-       crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
+       bool crit = kvmppc_critical_section(vcpu);
 
        switch (priority) {
        case BOOK3S_IRQPRIO_DECREMENTER:
@@ -315,7 +205,6 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
                break;
        case BOOK3S_IRQPRIO_PROGRAM:
                vec = BOOK3S_INTERRUPT_PROGRAM;
-               flags = to_book3s(vcpu)->prog_flags;
                break;
        case BOOK3S_IRQPRIO_VSX:
                vec = BOOK3S_INTERRUPT_VSX;
@@ -346,7 +235,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 #endif
 
        if (deliver)
-               kvmppc_inject_interrupt(vcpu, vec, flags);
+               kvmppc_inject_interrupt(vcpu, vec, 0);
 
        return deliver;
 }
@@ -392,64 +281,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
        }
 
        /* Tell the guest about our interrupt status */
-       if (*pending)
-               vcpu->arch.shared->int_pending = 1;
-       else if (old_pending)
-               vcpu->arch.shared->int_pending = 0;
-}
-
-void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
-{
-       u32 host_pvr;
-
-       vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB;
-       vcpu->arch.pvr = pvr;
-#ifdef CONFIG_PPC_BOOK3S_64
-       if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
-               kvmppc_mmu_book3s_64_init(vcpu);
-               to_book3s(vcpu)->hior = 0xfff00000;
-               to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
-       } else
-#endif
-       {
-               kvmppc_mmu_book3s_32_init(vcpu);
-               to_book3s(vcpu)->hior = 0;
-               to_book3s(vcpu)->msr_mask = 0xffffffffULL;
-       }
-
-       /* If we are in hypervisor level on 970, we can tell the CPU to
-        * treat DCBZ as 32 bytes store */
-       vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
-       if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) &&
-           !strcmp(cur_cpu_spec->platform, "ppc970"))
-               vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
-
-       /* Cell performs badly if MSR_FEx are set. So let's hope nobody
-          really needs them in a VM on Cell and force disable them. */
-       if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be"))
-               to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1);
-
-#ifdef CONFIG_PPC_BOOK3S_32
-       /* 32 bit Book3S always has 32 byte dcbz */
-       vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
-#endif
-
-       /* On some CPUs we can execute paired single operations natively */
-       asm ( "mfpvr %0" : "=r"(host_pvr));
-       switch (host_pvr) {
-       case 0x00080200:        /* lonestar 2.0 */
-       case 0x00088202:        /* lonestar 2.2 */
-       case 0x70000100:        /* gekko 1.0 */
-       case 0x00080100:        /* gekko 2.0 */
-       case 0x00083203:        /* gekko 2.3a */
-       case 0x00083213:        /* gekko 2.3b */
-       case 0x00083204:        /* gekko 2.4 */
-       case 0x00083214:        /* gekko 2.4e (8SE) - retail HW2 */
-       case 0x00087200:        /* broadway */
-               vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS;
-               /* Enable HID2.PSE - in case we need it later */
-               mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29));
-       }
+       kvmppc_update_int_pending(vcpu, *pending, old_pending);
 }
 
 pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -471,44 +303,6 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
        return gfn_to_pfn(vcpu->kvm, gfn);
 }
 
-/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To
- * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to
- * emulate 32 bytes dcbz length.
- *
- * The Book3s_64 inventors also realized this case and implemented a special bit
- * in the HID5 register, which is a hypervisor ressource. Thus we can't use it.
- *
- * My approach here is to patch the dcbz instruction on executing pages.
- */
-static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
-{
-       struct page *hpage;
-       u64 hpage_offset;
-       u32 *page;
-       int i;
-
-       hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
-       if (is_error_page(hpage)) {
-               kvm_release_page_clean(hpage);
-               return;
-       }
-
-       hpage_offset = pte->raddr & ~PAGE_MASK;
-       hpage_offset &= ~0xFFFULL;
-       hpage_offset /= 4;
-
-       get_page(hpage);
-       page = kmap_atomic(hpage, KM_USER0);
-
-       /* patch dcbz into reserved instruction, so we trap */
-       for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
-               if ((page[i] & 0xff0007ff) == INS_DCBZ)
-                       page[i] &= 0xfffffff7;
-
-       kunmap_atomic(page, KM_USER0);
-       put_page(hpage);
-}
-
 static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
                         struct kvmppc_pte *pte)
 {
@@ -606,519 +400,6 @@ mmio:
        return EMULATE_DO_MMIO;
 }
 
-static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-       ulong mp_pa = vcpu->arch.magic_page_pa;
-
-       if (unlikely(mp_pa) &&
-           unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) {
-               return 1;
-       }
-
-       return kvm_is_visible_gfn(vcpu->kvm, gfn);
-}
-
-int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                           ulong eaddr, int vec)
-{
-       bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE);
-       int r = RESUME_GUEST;
-       int relocated;
-       int page_found = 0;
-       struct kvmppc_pte pte;
-       bool is_mmio = false;
-       bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false;
-       bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false;
-       u64 vsid;
-
-       relocated = data ? dr : ir;
-
-       /* Resolve real address if translation turned on */
-       if (relocated) {
-               page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data);
-       } else {
-               pte.may_execute = true;
-               pte.may_read = true;
-               pte.may_write = true;
-               pte.raddr = eaddr & KVM_PAM;
-               pte.eaddr = eaddr;
-               pte.vpage = eaddr >> 12;
-       }
-
-       switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
-       case 0:
-               pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
-               break;
-       case MSR_DR:
-       case MSR_IR:
-               vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
-
-               if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR)
-                       pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
-               else
-                       pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
-               pte.vpage |= vsid;
-
-               if (vsid == -1)
-                       page_found = -EINVAL;
-               break;
-       }
-
-       if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
-          (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
-               /*
-                * If we do the dcbz hack, we have to NX on every execution,
-                * so we can patch the executing code. This renders our guest
-                * NX-less.
-                */
-               pte.may_execute = !data;
-       }
-
-       if (page_found == -ENOENT) {
-               /* Page not found in guest PTE entries */
-               vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-               vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
-               vcpu->arch.shared->msr |=
-                       (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
-               kvmppc_book3s_queue_irqprio(vcpu, vec);
-       } else if (page_found == -EPERM) {
-               /* Storage protection */
-               vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-               vcpu->arch.shared->dsisr =
-                       to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
-               vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
-               vcpu->arch.shared->msr |=
-                       (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
-               kvmppc_book3s_queue_irqprio(vcpu, vec);
-       } else if (page_found == -EINVAL) {
-               /* Page not found in guest SLB */
-               vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-               kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
-       } else if (!is_mmio &&
-                  kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
-               /* The guest's PTE is not mapped yet. Map on the host */
-               kvmppc_mmu_map_page(vcpu, &pte);
-               if (data)
-                       vcpu->stat.sp_storage++;
-               else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
-                       (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
-                       kvmppc_patch_dcbz(vcpu, &pte);
-       } else {
-               /* MMIO */
-               vcpu->stat.mmio_exits++;
-               vcpu->arch.paddr_accessed = pte.raddr;
-               r = kvmppc_emulate_mmio(run, vcpu);
-               if ( r == RESUME_HOST_NV )
-                       r = RESUME_HOST;
-       }
-
-       return r;
-}
-
-static inline int get_fpr_index(int i)
-{
-#ifdef CONFIG_VSX
-       i *= 2;
-#endif
-       return i;
-}
-
-/* Give up external provider (FPU, Altivec, VSX) */
-void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
-{
-       struct thread_struct *t = &current->thread;
-       u64 *vcpu_fpr = vcpu->arch.fpr;
-#ifdef CONFIG_VSX
-       u64 *vcpu_vsx = vcpu->arch.vsr;
-#endif
-       u64 *thread_fpr = (u64*)t->fpr;
-       int i;
-
-       if (!(vcpu->arch.guest_owned_ext & msr))
-               return;
-
-#ifdef DEBUG_EXT
-       printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
-#endif
-
-       switch (msr) {
-       case MSR_FP:
-               giveup_fpu(current);
-               for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
-                       vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
-
-               vcpu->arch.fpscr = t->fpscr.val;
-               break;
-       case MSR_VEC:
-#ifdef CONFIG_ALTIVEC
-               giveup_altivec(current);
-               memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
-               vcpu->arch.vscr = t->vscr;
-#endif
-               break;
-       case MSR_VSX:
-#ifdef CONFIG_VSX
-               __giveup_vsx(current);
-               for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
-                       vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
-#endif
-               break;
-       default:
-               BUG();
-       }
-
-       vcpu->arch.guest_owned_ext &= ~msr;
-       current->thread.regs->msr &= ~msr;
-       kvmppc_recalc_shadow_msr(vcpu);
-}
-
-static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
-{
-       ulong srr0 = kvmppc_get_pc(vcpu);
-       u32 last_inst = kvmppc_get_last_inst(vcpu);
-       int ret;
-
-       ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
-       if (ret == -ENOENT) {
-               ulong msr = vcpu->arch.shared->msr;
-
-               msr = kvmppc_set_field(msr, 33, 33, 1);
-               msr = kvmppc_set_field(msr, 34, 36, 0);
-               vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0);
-               kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
-               return EMULATE_AGAIN;
-       }
-
-       return EMULATE_DONE;
-}
-
-static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr)
-{
-
-       /* Need to do paired single emulation? */
-       if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
-               return EMULATE_DONE;
-
-       /* Read out the instruction */
-       if (kvmppc_read_inst(vcpu) == EMULATE_DONE)
-               /* Need to emulate */
-               return EMULATE_FAIL;
-
-       return EMULATE_AGAIN;
-}
-
-/* Handle external providers (FPU, Altivec, VSX) */
-static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
-                            ulong msr)
-{
-       struct thread_struct *t = &current->thread;
-       u64 *vcpu_fpr = vcpu->arch.fpr;
-#ifdef CONFIG_VSX
-       u64 *vcpu_vsx = vcpu->arch.vsr;
-#endif
-       u64 *thread_fpr = (u64*)t->fpr;
-       int i;
-
-       /* When we have paired singles, we emulate in software */
-       if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
-               return RESUME_GUEST;
-
-       if (!(vcpu->arch.shared->msr & msr)) {
-               kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-               return RESUME_GUEST;
-       }
-
-       /* We already own the ext */
-       if (vcpu->arch.guest_owned_ext & msr) {
-               return RESUME_GUEST;
-       }
-
-#ifdef DEBUG_EXT
-       printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
-#endif
-
-       current->thread.regs->msr |= msr;
-
-       switch (msr) {
-       case MSR_FP:
-               for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
-                       thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
-
-               t->fpscr.val = vcpu->arch.fpscr;
-               t->fpexc_mode = 0;
-               kvmppc_load_up_fpu();
-               break;
-       case MSR_VEC:
-#ifdef CONFIG_ALTIVEC
-               memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
-               t->vscr = vcpu->arch.vscr;
-               t->vrsave = -1;
-               kvmppc_load_up_altivec();
-#endif
-               break;
-       case MSR_VSX:
-#ifdef CONFIG_VSX
-               for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
-                       thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
-               kvmppc_load_up_vsx();
-#endif
-               break;
-       default:
-               BUG();
-       }
-
-       vcpu->arch.guest_owned_ext |= msr;
-
-       kvmppc_recalc_shadow_msr(vcpu);
-
-       return RESUME_GUEST;
-}
-
-int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                       unsigned int exit_nr)
-{
-       int r = RESUME_HOST;
-
-       vcpu->stat.sum_exits++;
-
-       run->exit_reason = KVM_EXIT_UNKNOWN;
-       run->ready_for_interrupt_injection = 1;
-
-       trace_kvm_book3s_exit(exit_nr, vcpu);
-       kvm_resched(vcpu);
-       switch (exit_nr) {
-       case BOOK3S_INTERRUPT_INST_STORAGE:
-               vcpu->stat.pf_instruc++;
-
-#ifdef CONFIG_PPC_BOOK3S_32
-               /* We set segments as unused segments when invalidating them. So
-                * treat the respective fault as segment fault. */
-               if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]
-                   == SR_INVALID) {
-                       kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
-                       r = RESUME_GUEST;
-                       break;
-               }
-#endif
-
-               /* only care about PTEG not found errors, but leave NX alone */
-               if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) {
-                       r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
-                       vcpu->stat.sp_instruc++;
-               } else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
-                         (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
-                       /*
-                        * XXX If we do the dcbz hack we use the NX bit to flush&patch the page,
-                        *     so we can't use the NX bit inside the guest. Let's cross our fingers,
-                        *     that no guest that needs the dcbz hack does NX.
-                        */
-                       kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
-                       r = RESUME_GUEST;
-               } else {
-                       vcpu->arch.shared->msr |=
-                               to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
-                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-                       r = RESUME_GUEST;
-               }
-               break;
-       case BOOK3S_INTERRUPT_DATA_STORAGE:
-       {
-               ulong dar = kvmppc_get_fault_dar(vcpu);
-               vcpu->stat.pf_storage++;
-
-#ifdef CONFIG_PPC_BOOK3S_32
-               /* We set segments as unused segments when invalidating them. So
-                * treat the respective fault as segment fault. */
-               if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) {
-                       kvmppc_mmu_map_segment(vcpu, dar);
-                       r = RESUME_GUEST;
-                       break;
-               }
-#endif
-
-               /* The only case we need to handle is missing shadow PTEs */
-               if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
-                       r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
-               } else {
-                       vcpu->arch.shared->dar = dar;
-                       vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
-                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-                       r = RESUME_GUEST;
-               }
-               break;
-       }
-       case BOOK3S_INTERRUPT_DATA_SEGMENT:
-               if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
-                       vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-                       kvmppc_book3s_queue_irqprio(vcpu,
-                               BOOK3S_INTERRUPT_DATA_SEGMENT);
-               }
-               r = RESUME_GUEST;
-               break;
-       case BOOK3S_INTERRUPT_INST_SEGMENT:
-               if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) {
-                       kvmppc_book3s_queue_irqprio(vcpu,
-                               BOOK3S_INTERRUPT_INST_SEGMENT);
-               }
-               r = RESUME_GUEST;
-               break;
-       /* We're good on these - the host merely wanted to get our attention */
-       case BOOK3S_INTERRUPT_DECREMENTER:
-               vcpu->stat.dec_exits++;
-               r = RESUME_GUEST;
-               break;
-       case BOOK3S_INTERRUPT_EXTERNAL:
-               vcpu->stat.ext_intr_exits++;
-               r = RESUME_GUEST;
-               break;
-       case BOOK3S_INTERRUPT_PERFMON:
-               r = RESUME_GUEST;
-               break;
-       case BOOK3S_INTERRUPT_PROGRAM:
-       {
-               enum emulation_result er;
-               ulong flags;
-
-program_interrupt:
-               flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
-
-               if (vcpu->arch.shared->msr & MSR_PR) {
-#ifdef EXIT_DEBUG
-                       printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
-#endif
-                       if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) !=
-                           (INS_DCBZ & 0xfffffff7)) {
-                               kvmppc_core_queue_program(vcpu, flags);
-                               r = RESUME_GUEST;
-                               break;
-                       }
-               }
-
-               vcpu->stat.emulated_inst_exits++;
-               er = kvmppc_emulate_instruction(run, vcpu);
-               switch (er) {
-               case EMULATE_DONE:
-                       r = RESUME_GUEST_NV;
-                       break;
-               case EMULATE_AGAIN:
-                       r = RESUME_GUEST;
-                       break;
-               case EMULATE_FAIL:
-                       printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
-                              __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
-                       kvmppc_core_queue_program(vcpu, flags);
-                       r = RESUME_GUEST;
-                       break;
-               case EMULATE_DO_MMIO:
-                       run->exit_reason = KVM_EXIT_MMIO;
-                       r = RESUME_HOST_NV;
-                       break;
-               default:
-                       BUG();
-               }
-               break;
-       }
-       case BOOK3S_INTERRUPT_SYSCALL:
-               if (vcpu->arch.osi_enabled &&
-                   (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
-                   (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
-                       /* MOL hypercalls */
-                       u64 *gprs = run->osi.gprs;
-                       int i;
-
-                       run->exit_reason = KVM_EXIT_OSI;
-                       for (i = 0; i < 32; i++)
-                               gprs[i] = kvmppc_get_gpr(vcpu, i);
-                       vcpu->arch.osi_needed = 1;
-                       r = RESUME_HOST_NV;
-               } else if (!(vcpu->arch.shared->msr & MSR_PR) &&
-                   (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
-                       /* KVM PV hypercalls */
-                       kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
-                       r = RESUME_GUEST;
-               } else {
-                       /* Guest syscalls */
-                       vcpu->stat.syscall_exits++;
-                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-                       r = RESUME_GUEST;
-               }
-               break;
-       case BOOK3S_INTERRUPT_FP_UNAVAIL:
-       case BOOK3S_INTERRUPT_ALTIVEC:
-       case BOOK3S_INTERRUPT_VSX:
-       {
-               int ext_msr = 0;
-
-               switch (exit_nr) {
-               case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP;  break;
-               case BOOK3S_INTERRUPT_ALTIVEC:    ext_msr = MSR_VEC; break;
-               case BOOK3S_INTERRUPT_VSX:        ext_msr = MSR_VSX; break;
-               }
-
-               switch (kvmppc_check_ext(vcpu, exit_nr)) {
-               case EMULATE_DONE:
-                       /* everything ok - let's enable the ext */
-                       r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr);
-                       break;
-               case EMULATE_FAIL:
-                       /* we need to emulate this instruction */
-                       goto program_interrupt;
-                       break;
-               default:
-                       /* nothing to worry about - go again */
-                       break;
-               }
-               break;
-       }
-       case BOOK3S_INTERRUPT_ALIGNMENT:
-               if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
-                       vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
-                               kvmppc_get_last_inst(vcpu));
-                       vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu,
-                               kvmppc_get_last_inst(vcpu));
-                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-               }
-               r = RESUME_GUEST;
-               break;
-       case BOOK3S_INTERRUPT_MACHINE_CHECK:
-       case BOOK3S_INTERRUPT_TRACE:
-               kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-               r = RESUME_GUEST;
-               break;
-       default:
-               /* Ugh - bork here! What did we get? */
-               printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
-                       exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1);
-               r = RESUME_HOST;
-               BUG();
-               break;
-       }
-
-
-       if (!(r & RESUME_HOST)) {
-               /* To avoid clobbering exit_reason, only check for signals if
-                * we aren't already exiting to userspace for some other
-                * reason. */
-               if (signal_pending(current)) {
-#ifdef EXIT_DEBUG
-                       printk(KERN_EMERG "KVM: Going back to host\n");
-#endif
-                       vcpu->stat.signal_exits++;
-                       run->exit_reason = KVM_EXIT_INTR;
-                       r = -EINTR;
-               } else {
-                       /* In case an interrupt came in that was triggered
-                        * from userspace (like DEC), we need to check what
-                        * to inject now! */
-                       kvmppc_core_deliver_interrupts(vcpu);
-               }
-       }
-
-       trace_kvm_book3s_reenter(r, vcpu);
-
-       return r;
-}
-
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
        return 0;
@@ -1179,69 +460,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        return 0;
 }
 
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
-{
-       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
-       int i;
-
-       sregs->pvr = vcpu->arch.pvr;
-
-       sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
-       if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
-               for (i = 0; i < 64; i++) {
-                       sregs->u.s.ppc64.slb[i].slbe = vcpu3s->slb[i].orige | i;
-                       sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv;
-               }
-       } else {
-               for (i = 0; i < 16; i++)
-                       sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i];
-
-               for (i = 0; i < 8; i++) {
-                       sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
-                       sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
-               }
-       }
-
-       return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
-{
-       struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
-       int i;
-
-       kvmppc_set_pvr(vcpu, sregs->pvr);
-
-       vcpu3s->sdr1 = sregs->u.s.sdr1;
-       if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
-               for (i = 0; i < 64; i++) {
-                       vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
-                                                   sregs->u.s.ppc64.slb[i].slbe);
-               }
-       } else {
-               for (i = 0; i < 16; i++) {
-                       vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
-               }
-               for (i = 0; i < 8; i++) {
-                       kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false,
-                                      (u32)sregs->u.s.ppc32.ibat[i]);
-                       kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true,
-                                      (u32)(sregs->u.s.ppc32.ibat[i] >> 32));
-                       kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false,
-                                      (u32)sregs->u.s.ppc32.dbat[i]);
-                       kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true,
-                                      (u32)(sregs->u.s.ppc32.dbat[i] >> 32));
-               }
-       }
-
-       /* Flush the MMU after messing with the segments */
-       kvmppc_mmu_pte_flush(vcpu, 0, 0);
-
-       return 0;
-}
-
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
        return -ENOTSUPP;
@@ -1296,202 +514,3 @@ out:
        mutex_unlock(&kvm->slots_lock);
        return r;
 }
-
-int kvmppc_core_check_processor_compat(void)
-{
-       return 0;
-}
-
-struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
-{
-       struct kvmppc_vcpu_book3s *vcpu_book3s;
-       struct kvm_vcpu *vcpu;
-       int err = -ENOMEM;
-       unsigned long p;
-
-       vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
-       if (!vcpu_book3s)
-               goto out;
-
-       vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
-               kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
-       if (!vcpu_book3s->shadow_vcpu)
-               goto free_vcpu;
-
-       vcpu = &vcpu_book3s->vcpu;
-       err = kvm_vcpu_init(vcpu, kvm, id);
-       if (err)
-               goto free_shadow_vcpu;
-
-       p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
-       /* the real shared page fills the last 4k of our page */
-       vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096);
-       if (!p)
-               goto uninit_vcpu;
-
-       vcpu->arch.host_retip = kvm_return_point;
-       vcpu->arch.host_msr = mfmsr();
-#ifdef CONFIG_PPC_BOOK3S_64
-       /* default to book3s_64 (970fx) */
-       vcpu->arch.pvr = 0x3C0301;
-#else
-       /* default to book3s_32 (750) */
-       vcpu->arch.pvr = 0x84202;
-#endif
-       kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
-       vcpu_book3s->slb_nr = 64;
-
-       /* remember where some real-mode handlers are */
-       vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
-       vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
-       vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
-#ifdef CONFIG_PPC_BOOK3S_64
-       vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
-#else
-       vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
-#endif
-
-       vcpu->arch.shadow_msr = MSR_USER64;
-
-       err = kvmppc_mmu_init(vcpu);
-       if (err < 0)
-               goto uninit_vcpu;
-
-       return vcpu;
-
-uninit_vcpu:
-       kvm_vcpu_uninit(vcpu);
-free_shadow_vcpu:
-       kfree(vcpu_book3s->shadow_vcpu);
-free_vcpu:
-       vfree(vcpu_book3s);
-out:
-       return ERR_PTR(err);
-}
-
-void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
-{
-       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
-
-       free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
-       kvm_vcpu_uninit(vcpu);
-       kfree(vcpu_book3s->shadow_vcpu);
-       vfree(vcpu_book3s);
-}
-
-extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
-int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
-{
-       int ret;
-       double fpr[32][TS_FPRWIDTH];
-       unsigned int fpscr;
-       int fpexc_mode;
-#ifdef CONFIG_ALTIVEC
-       vector128 vr[32];
-       vector128 vscr;
-       unsigned long uninitialized_var(vrsave);
-       int used_vr;
-#endif
-#ifdef CONFIG_VSX
-       int used_vsr;
-#endif
-       ulong ext_msr;
-
-       /* No need to go into the guest when all we do is going out */
-       if (signal_pending(current)) {
-               kvm_run->exit_reason = KVM_EXIT_INTR;
-               return -EINTR;
-       }
-
-       /* Save FPU state in stack */
-       if (current->thread.regs->msr & MSR_FP)
-               giveup_fpu(current);
-       memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
-       fpscr = current->thread.fpscr.val;
-       fpexc_mode = current->thread.fpexc_mode;
-
-#ifdef CONFIG_ALTIVEC
-       /* Save Altivec state in stack */
-       used_vr = current->thread.used_vr;
-       if (used_vr) {
-               if (current->thread.regs->msr & MSR_VEC)
-                       giveup_altivec(current);
-               memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
-               vscr = current->thread.vscr;
-               vrsave = current->thread.vrsave;
-       }
-#endif
-
-#ifdef CONFIG_VSX
-       /* Save VSX state in stack */
-       used_vsr = current->thread.used_vsr;
-       if (used_vsr && (current->thread.regs->msr & MSR_VSX))
-                       __giveup_vsx(current);
-#endif
-
-       /* Remember the MSR with disabled extensions */
-       ext_msr = current->thread.regs->msr;
-
-       /* XXX we get called with irq disabled - change that! */
-       local_irq_enable();
-
-       /* Preload FPU if it's enabled */
-       if (vcpu->arch.shared->msr & MSR_FP)
-               kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
-
-       ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
-
-       local_irq_disable();
-
-       current->thread.regs->msr = ext_msr;
-
-       /* Make sure we save the guest FPU/Altivec/VSX state */
-       kvmppc_giveup_ext(vcpu, MSR_FP);
-       kvmppc_giveup_ext(vcpu, MSR_VEC);
-       kvmppc_giveup_ext(vcpu, MSR_VSX);
-
-       /* Restore FPU state from stack */
-       memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
-       current->thread.fpscr.val = fpscr;
-       current->thread.fpexc_mode = fpexc_mode;
-
-#ifdef CONFIG_ALTIVEC
-       /* Restore Altivec state from stack */
-       if (used_vr && current->thread.used_vr) {
-               memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
-               current->thread.vscr = vscr;
-               current->thread.vrsave = vrsave;
-       }
-       current->thread.used_vr = used_vr;
-#endif
-
-#ifdef CONFIG_VSX
-       current->thread.used_vsr = used_vsr;
-#endif
-
-       return ret;
-}
-
-static int kvmppc_book3s_init(void)
-{
-       int r;
-
-       r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
-                    THIS_MODULE);
-
-       if (r)
-               return r;
-
-       r = kvmppc_mmu_hpte_sysinit();
-
-       return r;
-}
-
-static void kvmppc_book3s_exit(void)
-{
-       kvmppc_mmu_hpte_sysexit();
-       kvm_exit();
-}
-
-module_init(kvmppc_book3s_init);
-module_exit(kvmppc_book3s_exit);
index d7889ef..c6d3e19 100644 (file)
@@ -41,36 +41,36 @@ static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
 }
 
 static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
-                               struct kvmppc_vcpu_book3s *vcpu_book3s,
+                               struct kvm_vcpu *vcpu,
                                gva_t eaddr)
 {
        int i;
        u64 esid = GET_ESID(eaddr);
        u64 esid_1t = GET_ESID_1T(eaddr);
 
-       for (i = 0; i < vcpu_book3s->slb_nr; i++) {
+       for (i = 0; i < vcpu->arch.slb_nr; i++) {
                u64 cmp_esid = esid;
 
-               if (!vcpu_book3s->slb[i].valid)
+               if (!vcpu->arch.slb[i].valid)
                        continue;
 
-               if (vcpu_book3s->slb[i].tb)
+               if (vcpu->arch.slb[i].tb)
                        cmp_esid = esid_1t;
 
-               if (vcpu_book3s->slb[i].esid == cmp_esid)
-                       return &vcpu_book3s->slb[i];
+               if (vcpu->arch.slb[i].esid == cmp_esid)
+                       return &vcpu->arch.slb[i];
        }
 
        dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n",
                eaddr, esid, esid_1t);
-       for (i = 0; i < vcpu_book3s->slb_nr; i++) {
-           if (vcpu_book3s->slb[i].vsid)
+       for (i = 0; i < vcpu->arch.slb_nr; i++) {
+           if (vcpu->arch.slb[i].vsid)
                dprintk("  %d: %c%c%c %llx %llx\n", i,
-                       vcpu_book3s->slb[i].valid ? 'v' : ' ',
-                       vcpu_book3s->slb[i].large ? 'l' : ' ',
-                       vcpu_book3s->slb[i].tb    ? 't' : ' ',
-                       vcpu_book3s->slb[i].esid,
-                       vcpu_book3s->slb[i].vsid);
+                       vcpu->arch.slb[i].valid ? 'v' : ' ',
+                       vcpu->arch.slb[i].large ? 'l' : ' ',
+                       vcpu->arch.slb[i].tb    ? 't' : ' ',
+                       vcpu->arch.slb[i].esid,
+                       vcpu->arch.slb[i].vsid);
        }
 
        return NULL;
@@ -81,7 +81,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
 {
        struct kvmppc_slb *slb;
 
-       slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), eaddr);
+       slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
        if (!slb)
                return 0;
 
@@ -180,7 +180,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                return 0;
        }
 
-       slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr);
+       slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
        if (!slbe)
                goto no_seg_found;
 
@@ -320,10 +320,10 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
        esid_1t = GET_ESID_1T(rb);
        slb_nr = rb & 0xfff;
 
-       if (slb_nr > vcpu_book3s->slb_nr)
+       if (slb_nr > vcpu->arch.slb_nr)
                return;
 
-       slbe = &vcpu_book3s->slb[slb_nr];
+       slbe = &vcpu->arch.slb[slb_nr];
 
        slbe->large = (rs & SLB_VSID_L) ? 1 : 0;
        slbe->tb    = (rs & SLB_VSID_B_1T) ? 1 : 0;
@@ -344,38 +344,35 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
 
 static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr)
 {
-       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
        struct kvmppc_slb *slbe;
 
-       if (slb_nr > vcpu_book3s->slb_nr)
+       if (slb_nr > vcpu->arch.slb_nr)
                return 0;
 
-       slbe = &vcpu_book3s->slb[slb_nr];
+       slbe = &vcpu->arch.slb[slb_nr];
 
        return slbe->orige;
 }
 
 static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr)
 {
-       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
        struct kvmppc_slb *slbe;
 
-       if (slb_nr > vcpu_book3s->slb_nr)
+       if (slb_nr > vcpu->arch.slb_nr)
                return 0;
 
-       slbe = &vcpu_book3s->slb[slb_nr];
+       slbe = &vcpu->arch.slb[slb_nr];
 
        return slbe->origv;
 }
 
 static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
 {
-       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
        struct kvmppc_slb *slbe;
 
        dprintk("KVM MMU: slbie(0x%llx)\n", ea);
 
-       slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, ea);
+       slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
 
        if (!slbe)
                return;
@@ -389,13 +386,12 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
 
 static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
 {
-       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
        int i;
 
        dprintk("KVM MMU: slbia()\n");
 
-       for (i = 1; i < vcpu_book3s->slb_nr; i++)
-               vcpu_book3s->slb[i].valid = false;
+       for (i = 1; i < vcpu->arch.slb_nr; i++)
+               vcpu->arch.slb[i].valid = false;
 
        if (vcpu->arch.shared->msr & MSR_IR) {
                kvmppc_mmu_flush_segments(vcpu);
@@ -464,7 +460,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
        ulong mp_ea = vcpu->arch.magic_page_ea;
 
        if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
-               slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea);
+               slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
                if (slb)
                        gvsid = slb->vsid;
        }
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
new file mode 100644 (file)
index 0000000..bc3a2ea
--- /dev/null
@@ -0,0 +1,180 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cputable.h>
+
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER      24
+#define HPT_NPTEG      (1ul << (HPT_ORDER - 7))        /* 128B per pteg */
+#define HPT_HASH_MASK  (HPT_NPTEG - 1)
+
+/* Pages in the VRMA are 16MB pages */
+#define VRMA_PAGE_ORDER        24
+#define VRMA_VSID      0x1ffffffUL     /* 1TB VSID reserved for VRMA */
+
+/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
+#define MAX_LPID_970   63
+#define NR_LPIDS       (LPID_RSVD + 1)
+unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
+
+long kvmppc_alloc_hpt(struct kvm *kvm)
+{
+       unsigned long hpt;
+       unsigned long lpid;
+
+       hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
+                              HPT_ORDER - PAGE_SHIFT);
+       if (!hpt) {
+               pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
+               return -ENOMEM;
+       }
+       kvm->arch.hpt_virt = hpt;
+
+       do {
+               lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
+               if (lpid >= NR_LPIDS) {
+                       pr_err("kvm_alloc_hpt: No LPIDs free\n");
+                       free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
+                       return -ENOMEM;
+               }
+       } while (test_and_set_bit(lpid, lpid_inuse));
+
+       kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
+       kvm->arch.lpid = lpid;
+
+       pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
+       return 0;
+}
+
+void kvmppc_free_hpt(struct kvm *kvm)
+{
+       clear_bit(kvm->arch.lpid, lpid_inuse);
+       free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
+}
+
+void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
+{
+       unsigned long i;
+       unsigned long npages = kvm->arch.ram_npages;
+       unsigned long pfn;
+       unsigned long *hpte;
+       unsigned long hash;
+       struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
+
+       if (!pginfo)
+               return;
+
+       /* VRMA can't be > 1TB */
+       if (npages > 1ul << (40 - kvm->arch.ram_porder))
+               npages = 1ul << (40 - kvm->arch.ram_porder);
+       /* Can't use more than 1 HPTE per HPTEG */
+       if (npages > HPT_NPTEG)
+               npages = HPT_NPTEG;
+
+       for (i = 0; i < npages; ++i) {
+               pfn = pginfo[i].pfn;
+               if (!pfn)
+                       break;
+               /* can't use hpt_hash since va > 64 bits */
+               hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
+               /*
+                * We assume that the hash table is empty and no
+                * vcpus are using it at this stage.  Since we create
+                * at most one HPTE per HPTEG, we just assume entry 7
+                * is available and use it.
+                */
+               hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7));
+               hpte += 7 * 2;
+               /* HPTE low word - RPN, protection, etc. */
+               hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
+                       HPTE_R_M | PP_RWXX;
+               wmb();
+               hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
+                       (i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
+                       HPTE_V_LARGE | HPTE_V_VALID;
+       }
+}
+
+int kvmppc_mmu_hv_init(void)
+{
+       unsigned long host_lpid, rsvd_lpid;
+
+       if (!cpu_has_feature(CPU_FTR_HVMODE))
+               return -EINVAL;
+
+       memset(lpid_inuse, 0, sizeof(lpid_inuse));
+
+       if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+               host_lpid = mfspr(SPRN_LPID);   /* POWER7 */
+               rsvd_lpid = LPID_RSVD;
+       } else {
+               host_lpid = 0;                  /* PPC970 */
+               rsvd_lpid = MAX_LPID_970;
+       }
+
+       set_bit(host_lpid, lpid_inuse);
+       /* rsvd_lpid is reserved for use in partition switching */
+       set_bit(rsvd_lpid, lpid_inuse);
+
+       return 0;
+}
+
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+}
+
+static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
+{
+       kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
+}
+
+static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+                               struct kvmppc_pte *gpte, bool data)
+{
+       return -ENOENT;
+}
+
+void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+       if (cpu_has_feature(CPU_FTR_ARCH_206))
+               vcpu->arch.slb_nr = 32;         /* POWER7 */
+       else
+               vcpu->arch.slb_nr = 64;
+
+       mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+       mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
+
+       vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
+}
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
new file mode 100644 (file)
index 0000000..ea0f8c5
--- /dev/null
@@ -0,0 +1,73 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/list.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_host.h>
+#include <asm/udbg.h>
+
+#define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
+
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+                     unsigned long ioba, unsigned long tce)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvmppc_spapr_tce_table *stt;
+
+       /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
+       /*          liobn, ioba, tce); */
+
+       list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+               if (stt->liobn == liobn) {
+                       unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+                       struct page *page;
+                       u64 *tbl;
+
+                       /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
+                       /*          liobn, stt, stt->window_size); */
+                       if (ioba >= stt->window_size)
+                               return H_PARAMETER;
+
+                       page = stt->pages[idx / TCES_PER_PAGE];
+                       tbl = (u64 *)page_address(page);
+
+                       /* FIXME: Need to validate the TCE itself */
+                       /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
+                       tbl[idx % TCES_PER_PAGE] = tce;
+                       return H_SUCCESS;
+               }
+       }
+
+       /* Didn't find the liobn, punt it to userspace */
+       return H_TOO_HARD;
+}
index 1dd5a1d..88c8f26 100644 (file)
 #include <linux/module.h>
 #include <asm/kvm_book3s.h>
 
-EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter);
-EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem);
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);
+#else
+EXPORT_SYMBOL_GPL(kvmppc_handler_trampoline_enter);
+EXPORT_SYMBOL_GPL(kvmppc_handler_lowmem_trampoline);
 EXPORT_SYMBOL_GPL(kvmppc_rmcall);
 EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
 #ifdef CONFIG_ALTIVEC
@@ -30,3 +33,5 @@ EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
 #ifdef CONFIG_VSX
 EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx);
 #endif
+#endif
+
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
new file mode 100644 (file)
index 0000000..cc0d7f1
--- /dev/null
@@ -0,0 +1,1269 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Paul Mackerras <paulus@au1.ibm.com>
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * Description: KVM functions specific to running on Book 3S
+ * processors in hypervisor mode (specifically POWER7 and later).
+ *
+ * This file is derived from arch/powerpc/kvm/book3s.c,
+ * by Alexander Graf <agraf@suse.de>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/lppaca.h>
+#include <asm/processor.h>
+#include <asm/cputhreads.h>
+#include <asm/page.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+/*
+ * For now, limit memory to 64GB and require it to be large pages.
+ * This value is chosen because it makes the ram_pginfo array be
+ * 64kB in size, which is about as large as we want to be trying
+ * to allocate with kmalloc.
+ */
+#define MAX_MEM_ORDER          36
+
+#define LARGE_PAGE_ORDER       24      /* 16MB pages */
+
+/* #define EXIT_DEBUG */
+/* #define EXIT_DEBUG_SIMPLE */
+/* #define EXIT_DEBUG_INT */
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       local_paca->kvm_hstate.kvm_vcpu = vcpu;
+       local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore;
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+}
+
+static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
+static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
+
+void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
+{
+       u64 now;
+       unsigned long dec_nsec;
+
+       now = get_tb();
+       if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
+               kvmppc_core_queue_dec(vcpu);
+       if (vcpu->arch.pending_exceptions)
+               return;
+       if (vcpu->arch.dec_expires != ~(u64)0) {
+               dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
+                       tb_ticks_per_sec;
+               hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+                             HRTIMER_MODE_REL);
+       }
+
+       kvmppc_vcpu_blocked(vcpu);
+
+       kvm_vcpu_block(vcpu);
+       vcpu->stat.halt_wakeup++;
+
+       if (vcpu->arch.dec_expires != ~(u64)0)
+               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+
+       kvmppc_vcpu_unblocked(vcpu);
+}
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+       vcpu->arch.shregs.msr = msr;
+}
+
+void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+{
+       vcpu->arch.pvr = pvr;
+}
+
+void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
+{
+       int r;
+
+       pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
+       pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n",
+              vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
+       for (r = 0; r < 16; ++r)
+               pr_err("r%2d = %.16lx  r%d = %.16lx\n",
+                      r, kvmppc_get_gpr(vcpu, r),
+                      r+16, kvmppc_get_gpr(vcpu, r+16));
+       pr_err("ctr = %.16lx  lr  = %.16lx\n",
+              vcpu->arch.ctr, vcpu->arch.lr);
+       pr_err("srr0 = %.16llx srr1 = %.16llx\n",
+              vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
+       pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
+              vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
+       pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
+              vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
+       pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
+              vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
+       pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
+       pr_err("fault dar = %.16lx dsisr = %.8x\n",
+              vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+       pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
+       for (r = 0; r < vcpu->arch.slb_max; ++r)
+               pr_err("  ESID = %.16llx VSID = %.16llx\n",
+                      vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
+       pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
+              vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1,
+              vcpu->arch.last_inst);
+}
+
+struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
+{
+       int r;
+       struct kvm_vcpu *v, *ret = NULL;
+
+       mutex_lock(&kvm->lock);
+       kvm_for_each_vcpu(r, v, kvm) {
+               if (v->vcpu_id == id) {
+                       ret = v;
+                       break;
+               }
+       }
+       mutex_unlock(&kvm->lock);
+       return ret;
+}
+
+static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
+{
+       vpa->shared_proc = 1;
+       vpa->yield_count = 1;
+}
+
+static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
+                                      unsigned long flags,
+                                      unsigned long vcpuid, unsigned long vpa)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long pg_index, ra, len;
+       unsigned long pg_offset;
+       void *va;
+       struct kvm_vcpu *tvcpu;
+
+       tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
+       if (!tvcpu)
+               return H_PARAMETER;
+
+       flags >>= 63 - 18;
+       flags &= 7;
+       if (flags == 0 || flags == 4)
+               return H_PARAMETER;
+       if (flags < 4) {
+               if (vpa & 0x7f)
+                       return H_PARAMETER;
+               /* registering new area; convert logical addr to real */
+               pg_index = vpa >> kvm->arch.ram_porder;
+               pg_offset = vpa & (kvm->arch.ram_psize - 1);
+               if (pg_index >= kvm->arch.ram_npages)
+                       return H_PARAMETER;
+               if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
+                       return H_PARAMETER;
+               ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
+               ra |= pg_offset;
+               va = __va(ra);
+               if (flags <= 1)
+                       len = *(unsigned short *)(va + 4);
+               else
+                       len = *(unsigned int *)(va + 4);
+               if (pg_offset + len > kvm->arch.ram_psize)
+                       return H_PARAMETER;
+               switch (flags) {
+               case 1:         /* register VPA */
+                       if (len < 640)
+                               return H_PARAMETER;
+                       tvcpu->arch.vpa = va;
+                       init_vpa(vcpu, va);
+                       break;
+               case 2:         /* register DTL */
+                       if (len < 48)
+                               return H_PARAMETER;
+                       if (!tvcpu->arch.vpa)
+                               return H_RESOURCE;
+                       len -= len % 48;
+                       tvcpu->arch.dtl = va;
+                       tvcpu->arch.dtl_end = va + len;
+                       break;
+               case 3:         /* register SLB shadow buffer */
+                       if (len < 8)
+                               return H_PARAMETER;
+                       if (!tvcpu->arch.vpa)
+                               return H_RESOURCE;
+                       tvcpu->arch.slb_shadow = va;
+                       len = (len - 16) / 16;
+                       tvcpu->arch.slb_shadow = va;
+                       break;
+               }
+       } else {
+               switch (flags) {
+               case 5:         /* unregister VPA */
+                       if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
+                               return H_RESOURCE;
+                       tvcpu->arch.vpa = NULL;
+                       break;
+               case 6:         /* unregister DTL */
+                       tvcpu->arch.dtl = NULL;
+                       break;
+               case 7:         /* unregister SLB shadow buffer */
+                       tvcpu->arch.slb_shadow = NULL;
+                       break;
+               }
+       }
+       return H_SUCCESS;
+}
+
+int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
+{
+       unsigned long req = kvmppc_get_gpr(vcpu, 3);
+       unsigned long target, ret = H_SUCCESS;
+       struct kvm_vcpu *tvcpu;
+
+       switch (req) {
+       case H_CEDE:
+               vcpu->arch.shregs.msr |= MSR_EE;
+               vcpu->arch.ceded = 1;
+               smp_mb();
+               if (!vcpu->arch.prodded)
+                       kvmppc_vcpu_block(vcpu);
+               else
+                       vcpu->arch.prodded = 0;
+               smp_mb();
+               vcpu->arch.ceded = 0;
+               break;
+       case H_PROD:
+               target = kvmppc_get_gpr(vcpu, 4);
+               tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+               if (!tvcpu) {
+                       ret = H_PARAMETER;
+                       break;
+               }
+               tvcpu->arch.prodded = 1;
+               smp_mb();
+               if (vcpu->arch.ceded) {
+                       if (waitqueue_active(&vcpu->wq)) {
+                               wake_up_interruptible(&vcpu->wq);
+                               vcpu->stat.halt_wakeup++;
+                       }
+               }
+               break;
+       case H_CONFER:
+               break;
+       case H_REGISTER_VPA:
+               ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                       kvmppc_get_gpr(vcpu, 5),
+                                       kvmppc_get_gpr(vcpu, 6));
+               break;
+       default:
+               return RESUME_HOST;
+       }
+       kvmppc_set_gpr(vcpu, 3, ret);
+       vcpu->arch.hcall_needed = 0;
+       return RESUME_GUEST;
+}
+
+static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                             struct task_struct *tsk)
+{
+       int r = RESUME_HOST;
+
+       vcpu->stat.sum_exits++;
+
+       run->exit_reason = KVM_EXIT_UNKNOWN;
+       run->ready_for_interrupt_injection = 1;
+       switch (vcpu->arch.trap) {
+       /* We're good on these - the host merely wanted to get our attention */
+       case BOOK3S_INTERRUPT_HV_DECREMENTER:
+               vcpu->stat.dec_exits++;
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_EXTERNAL:
+               vcpu->stat.ext_intr_exits++;
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_PERFMON:
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_PROGRAM:
+       {
+               ulong flags;
+               /*
+                * Normally program interrupts are delivered directly
+                * to the guest by the hardware, but we can get here
+                * as a result of a hypervisor emulation interrupt
+                * (e40) getting turned into a 700 by BML RTAS.
+                */
+               flags = vcpu->arch.shregs.msr & 0x1f0000ull;
+               kvmppc_core_queue_program(vcpu, flags);
+               r = RESUME_GUEST;
+               break;
+       }
+       case BOOK3S_INTERRUPT_SYSCALL:
+       {
+               /* hcall - punt to userspace */
+               int i;
+
+               if (vcpu->arch.shregs.msr & MSR_PR) {
+                       /* sc 1 from userspace - reflect to guest syscall */
+                       kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
+                       r = RESUME_GUEST;
+                       break;
+               }
+               run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
+               for (i = 0; i < 9; ++i)
+                       run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
+               run->exit_reason = KVM_EXIT_PAPR_HCALL;
+               vcpu->arch.hcall_needed = 1;
+               r = RESUME_HOST;
+               break;
+       }
+       /*
+        * We get these next two if the guest does a bad real-mode access,
+        * as we have enabled VRMA (virtualized real mode area) mode in the
+        * LPCR.  We just generate an appropriate DSI/ISI to the guest.
+        */
+       case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+               vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
+               vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
+               kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_H_INST_STORAGE:
+               kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
+                                       0x08000000);
+               r = RESUME_GUEST;
+               break;
+       /*
+        * This occurs if the guest executes an illegal instruction.
+        * We just generate a program interrupt to the guest, since
+        * we don't emulate any guest instructions at this stage.
+        */
+       case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+               kvmppc_core_queue_program(vcpu, 0x80000);
+               r = RESUME_GUEST;
+               break;
+       default:
+               kvmppc_dump_regs(vcpu);
+               printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+                       vcpu->arch.trap, kvmppc_get_pc(vcpu),
+                       vcpu->arch.shregs.msr);
+               r = RESUME_HOST;
+               BUG();
+               break;
+       }
+
+
+       if (!(r & RESUME_HOST)) {
+               /* To avoid clobbering exit_reason, only check for signals if
+                * we aren't already exiting to userspace for some other
+                * reason. */
+               if (signal_pending(tsk)) {
+                       vcpu->stat.signal_exits++;
+                       run->exit_reason = KVM_EXIT_INTR;
+                       r = -EINTR;
+               } else {
+                       kvmppc_core_deliver_interrupts(vcpu);
+               }
+       }
+
+       return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+       int i;
+
+       sregs->pvr = vcpu->arch.pvr;
+
+       memset(sregs, 0, sizeof(struct kvm_sregs));
+       for (i = 0; i < vcpu->arch.slb_max; i++) {
+               sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
+               sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+       }
+
+       return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+       int i, j;
+
+       kvmppc_set_pvr(vcpu, sregs->pvr);
+
+       j = 0;
+       for (i = 0; i < vcpu->arch.slb_nr; i++) {
+               if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
+                       vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
+                       vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
+                       ++j;
+               }
+       }
+       vcpu->arch.slb_max = j;
+
+       return 0;
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+       if (cpu_has_feature(CPU_FTR_HVMODE))
+               return 0;
+       return -EIO;
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+       struct kvm_vcpu *vcpu;
+       int err = -EINVAL;
+       int core;
+       struct kvmppc_vcore *vcore;
+
+       core = id / threads_per_core;
+       if (core >= KVM_MAX_VCORES)
+               goto out;
+
+       err = -ENOMEM;
+       vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+       if (!vcpu)
+               goto out;
+
+       err = kvm_vcpu_init(vcpu, kvm, id);
+       if (err)
+               goto free_vcpu;
+
+       vcpu->arch.shared = &vcpu->arch.shregs;
+       vcpu->arch.last_cpu = -1;
+       vcpu->arch.mmcr[0] = MMCR0_FC;
+       vcpu->arch.ctrl = CTRL_RUNLATCH;
+       /* default to host PVR, since we can't spoof it */
+       vcpu->arch.pvr = mfspr(SPRN_PVR);
+       kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+
+       kvmppc_mmu_book3s_hv_init(vcpu);
+
+       /*
+        * Some vcpus may start out in stopped state.  If we initialize
+        * them to busy-in-host state they will stop other vcpus in the
+        * vcore from running.  Instead we initialize them to blocked
+        * state, effectively considering them to be stopped until we
+        * see the first run ioctl for them.
+        */
+       vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+
+       init_waitqueue_head(&vcpu->arch.cpu_run);
+
+       mutex_lock(&kvm->lock);
+       vcore = kvm->arch.vcores[core];
+       if (!vcore) {
+               vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
+               if (vcore) {
+                       INIT_LIST_HEAD(&vcore->runnable_threads);
+                       spin_lock_init(&vcore->lock);
+               }
+               kvm->arch.vcores[core] = vcore;
+       }
+       mutex_unlock(&kvm->lock);
+
+       if (!vcore)
+               goto free_vcpu;
+
+       spin_lock(&vcore->lock);
+       ++vcore->num_threads;
+       ++vcore->n_blocked;
+       spin_unlock(&vcore->lock);
+       vcpu->arch.vcore = vcore;
+
+       return vcpu;
+
+free_vcpu:
+       kfree(vcpu);
+out:
+       return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+       kvm_vcpu_uninit(vcpu);
+       kfree(vcpu);
+}
+
+static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       spin_lock(&vc->lock);
+       vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+       ++vc->n_blocked;
+       if (vc->n_runnable > 0 &&
+           vc->n_runnable + vc->n_blocked == vc->num_threads) {
+               vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
+                                       arch.run_list);
+               wake_up(&vcpu->arch.cpu_run);
+       }
+       spin_unlock(&vc->lock);
+}
+
+static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       spin_lock(&vc->lock);
+       vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+       --vc->n_blocked;
+       spin_unlock(&vc->lock);
+}
+
+extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+extern void xics_wake_cpu(int cpu);
+
+static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
+                                  struct kvm_vcpu *vcpu)
+{
+       struct kvm_vcpu *v;
+
+       if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
+               return;
+       vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+       --vc->n_runnable;
+       /* decrement the physical thread id of each following vcpu */
+       v = vcpu;
+       list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
+               --v->arch.ptid;
+       list_del(&vcpu->arch.run_list);
+}
+
+static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
+{
+       int cpu;
+       struct paca_struct *tpaca;
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+       cpu = vc->pcpu + vcpu->arch.ptid;
+       tpaca = &paca[cpu];
+       tpaca->kvm_hstate.kvm_vcpu = vcpu;
+       tpaca->kvm_hstate.kvm_vcore = vc;
+       smp_wmb();
+#ifdef CONFIG_PPC_ICP_NATIVE
+       if (vcpu->arch.ptid) {
+               tpaca->cpu_start = 0x80;
+               tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST;
+               wmb();
+               xics_wake_cpu(cpu);
+               ++vc->n_woken;
+       }
+#endif
+}
+
+static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
+{
+       int i;
+
+       HMT_low();
+       i = 0;
+       while (vc->nap_count < vc->n_woken) {
+               if (++i >= 1000000) {
+                       pr_err("kvmppc_wait_for_nap timeout %d %d\n",
+                              vc->nap_count, vc->n_woken);
+                       break;
+               }
+               cpu_relax();
+       }
+       HMT_medium();
+}
+
+/*
+ * Check that we are on thread 0 and that any other threads in
+ * this core are off-line.
+ */
+static int on_primary_thread(void)
+{
+       int cpu = smp_processor_id();
+       int thr = cpu_thread_in_core(cpu);
+
+       if (thr)
+               return 0;
+       while (++thr < threads_per_core)
+               if (cpu_online(cpu + thr))
+                       return 0;
+       return 1;
+}
+
+/*
+ * Run a set of guest threads on a physical core.
+ * Called with vc->lock held.
+ */
+static int kvmppc_run_core(struct kvmppc_vcore *vc)
+{
+       struct kvm_vcpu *vcpu, *vnext;
+       long ret;
+       u64 now;
+
+       /* don't start if any threads have a signal pending */
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+               if (signal_pending(vcpu->arch.run_task))
+                       return 0;
+
+       /*
+        * Make sure we are running on thread 0, and that
+        * secondary threads are offline.
+        * XXX we should also block attempts to bring any
+        * secondary threads online.
+        */
+       if (threads_per_core > 1 && !on_primary_thread()) {
+               list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+                       vcpu->arch.ret = -EBUSY;
+               goto out;
+       }
+
+       vc->n_woken = 0;
+       vc->nap_count = 0;
+       vc->entry_exit_count = 0;
+       vc->vcore_running = 1;
+       vc->in_guest = 0;
+       vc->pcpu = smp_processor_id();
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+               kvmppc_start_thread(vcpu);
+       vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
+                               arch.run_list);
+
+       spin_unlock(&vc->lock);
+
+       preempt_disable();
+       kvm_guest_enter();
+       __kvmppc_vcore_entry(NULL, vcpu);
+
+       /* wait for secondary threads to finish writing their state to memory */
+       spin_lock(&vc->lock);
+       if (vc->nap_count < vc->n_woken)
+               kvmppc_wait_for_nap(vc);
+       /* prevent other vcpu threads from doing kvmppc_start_thread() now */
+       vc->vcore_running = 2;
+       spin_unlock(&vc->lock);
+
+       /* make sure updates to secondary vcpu structs are visible now */
+       smp_mb();
+       kvm_guest_exit();
+
+       preempt_enable();
+       kvm_resched(vcpu);
+
+       now = get_tb();
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+               /* cancel pending dec exception if dec is positive */
+               if (now < vcpu->arch.dec_expires &&
+                   kvmppc_core_pending_dec(vcpu))
+                       kvmppc_core_dequeue_dec(vcpu);
+               if (!vcpu->arch.trap) {
+                       if (signal_pending(vcpu->arch.run_task)) {
+                               vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
+                               vcpu->arch.ret = -EINTR;
+                       }
+                       continue;               /* didn't get to run */
+               }
+               ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
+                                        vcpu->arch.run_task);
+               vcpu->arch.ret = ret;
+               vcpu->arch.trap = 0;
+       }
+
+       spin_lock(&vc->lock);
+ out:
+       vc->vcore_running = 0;
+       list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+                                arch.run_list) {
+               if (vcpu->arch.ret != RESUME_GUEST) {
+                       kvmppc_remove_runnable(vc, vcpu);
+                       wake_up(&vcpu->arch.cpu_run);
+               }
+       }
+
+       return 1;
+}
+
+static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+       int ptid;
+       int wait_state;
+       struct kvmppc_vcore *vc;
+       DEFINE_WAIT(wait);
+
+       /* No need to go into the guest when all we do is going out */
+       if (signal_pending(current)) {
+               kvm_run->exit_reason = KVM_EXIT_INTR;
+               return -EINTR;
+       }
+
+       /* On PPC970, check that we have an RMA region */
+       if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201))
+               return -EPERM;
+
+       kvm_run->exit_reason = 0;
+       vcpu->arch.ret = RESUME_GUEST;
+       vcpu->arch.trap = 0;
+
+       flush_fp_to_thread(current);
+       flush_altivec_to_thread(current);
+       flush_vsx_to_thread(current);
+
+       /*
+        * Synchronize with other threads in this virtual core
+        */
+       vc = vcpu->arch.vcore;
+       spin_lock(&vc->lock);
+       /* This happens the first time this is called for a vcpu */
+       if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
+               --vc->n_blocked;
+       vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+       ptid = vc->n_runnable;
+       vcpu->arch.run_task = current;
+       vcpu->arch.kvm_run = kvm_run;
+       vcpu->arch.ptid = ptid;
+       list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
+       ++vc->n_runnable;
+
+       wait_state = TASK_INTERRUPTIBLE;
+       while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+               if (signal_pending(current)) {
+                       if (!vc->vcore_running) {
+                               kvm_run->exit_reason = KVM_EXIT_INTR;
+                               vcpu->arch.ret = -EINTR;
+                               break;
+                       }
+                       /* have to wait for vcore to stop executing guest */
+                       wait_state = TASK_UNINTERRUPTIBLE;
+                       smp_send_reschedule(vc->pcpu);
+               }
+
+               if (!vc->vcore_running &&
+                   vc->n_runnable + vc->n_blocked == vc->num_threads) {
+                       /* we can run now */
+                       if (kvmppc_run_core(vc))
+                               continue;
+               }
+
+               if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0)
+                       kvmppc_start_thread(vcpu);
+
+               /* wait for other threads to come in, or wait for vcore */
+               prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
+               spin_unlock(&vc->lock);
+               schedule();
+               finish_wait(&vcpu->arch.cpu_run, &wait);
+               spin_lock(&vc->lock);
+       }
+
+       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+               kvmppc_remove_runnable(vc, vcpu);
+       spin_unlock(&vc->lock);
+
+       return vcpu->arch.ret;
+}
+
+int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+       int r;
+
+       do {
+               r = kvmppc_run_vcpu(run, vcpu);
+
+               if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
+                   !(vcpu->arch.shregs.msr & MSR_PR)) {
+                       r = kvmppc_pseries_do_hcall(vcpu);
+                       kvmppc_core_deliver_interrupts(vcpu);
+               }
+       } while (r == RESUME_GUEST);
+       return r;
+}
+
+static long kvmppc_stt_npages(unsigned long window_size)
+{
+       return ALIGN((window_size >> SPAPR_TCE_SHIFT)
+                    * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+}
+
+static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+{
+       struct kvm *kvm = stt->kvm;
+       int i;
+
+       mutex_lock(&kvm->lock);
+       list_del(&stt->list);
+       for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+               __free_page(stt->pages[i]);
+       kfree(stt);
+       mutex_unlock(&kvm->lock);
+
+       kvm_put_kvm(kvm);
+}
+
+static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
+       struct page *page;
+
+       if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+               return VM_FAULT_SIGBUS;
+
+       page = stt->pages[vmf->pgoff];
+       get_page(page);
+       vmf->page = page;
+       return 0;
+}
+
+static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
+       .fault = kvm_spapr_tce_fault,
+};
+
+static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       vma->vm_ops = &kvm_spapr_tce_vm_ops;
+       return 0;
+}
+
+static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
+{
+       struct kvmppc_spapr_tce_table *stt = filp->private_data;
+
+       release_spapr_tce_table(stt);
+       return 0;
+}
+
+static struct file_operations kvm_spapr_tce_fops = {
+       .mmap           = kvm_spapr_tce_mmap,
+       .release        = kvm_spapr_tce_release,
+};
+
+long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+                                  struct kvm_create_spapr_tce *args)
+{
+       struct kvmppc_spapr_tce_table *stt = NULL;
+       long npages;
+       int ret = -ENOMEM;
+       int i;
+
+       /* Check this LIOBN hasn't been previously allocated */
+       list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+               if (stt->liobn == args->liobn)
+                       return -EBUSY;
+       }
+
+       npages = kvmppc_stt_npages(args->window_size);
+
+       stt = kzalloc(sizeof(*stt) + npages* sizeof(struct page *),
+                     GFP_KERNEL);
+       if (!stt)
+               goto fail;
+
+       stt->liobn = args->liobn;
+       stt->window_size = args->window_size;
+       stt->kvm = kvm;
+
+       for (i = 0; i < npages; i++) {
+               stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
+               if (!stt->pages[i])
+                       goto fail;
+       }
+
+       kvm_get_kvm(kvm);
+
+       mutex_lock(&kvm->lock);
+       list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+
+       mutex_unlock(&kvm->lock);
+
+       return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+                               stt, O_RDWR);
+
+fail:
+       if (stt) {
+               for (i = 0; i < npages; i++)
+                       if (stt->pages[i])
+                               __free_page(stt->pages[i]);
+
+               kfree(stt);
+       }
+       return ret;
+}
+
+/* Work out RMLS (real mode limit selector) field value for a given RMA size.
+   Assumes POWER7 or PPC970. */
+static inline int lpcr_rmls(unsigned long rma_size)
+{
+       switch (rma_size) {
+       case 32ul << 20:        /* 32 MB */
+               if (cpu_has_feature(CPU_FTR_ARCH_206))
+                       return 8;       /* only supported on POWER7 */
+               return -1;
+       case 64ul << 20:        /* 64 MB */
+               return 3;
+       case 128ul << 20:       /* 128 MB */
+               return 7;
+       case 256ul << 20:       /* 256 MB */
+               return 4;
+       case 1ul << 30:         /* 1 GB */
+               return 2;
+       case 16ul << 30:        /* 16 GB */
+               return 1;
+       case 256ul << 30:       /* 256 GB */
+               return 0;
+       default:
+               return -1;
+       }
+}
+
+static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct kvmppc_rma_info *ri = vma->vm_file->private_data;
+       struct page *page;
+
+       if (vmf->pgoff >= ri->npages)
+               return VM_FAULT_SIGBUS;
+
+       page = pfn_to_page(ri->base_pfn + vmf->pgoff);
+       get_page(page);
+       vmf->page = page;
+       return 0;
+}
+
+static const struct vm_operations_struct kvm_rma_vm_ops = {
+       .fault = kvm_rma_fault,
+};
+
+static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       vma->vm_flags |= VM_RESERVED;
+       vma->vm_ops = &kvm_rma_vm_ops;
+       return 0;
+}
+
+static int kvm_rma_release(struct inode *inode, struct file *filp)
+{
+       struct kvmppc_rma_info *ri = filp->private_data;
+
+       kvm_release_rma(ri);
+       return 0;
+}
+
+static struct file_operations kvm_rma_fops = {
+       .mmap           = kvm_rma_mmap,
+       .release        = kvm_rma_release,
+};
+
+long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
+{
+       struct kvmppc_rma_info *ri;
+       long fd;
+
+       ri = kvm_alloc_rma();
+       if (!ri)
+               return -ENOMEM;
+
+       fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR);
+       if (fd < 0)
+               kvm_release_rma(ri);
+
+       ret->rma_size = ri->npages << PAGE_SHIFT;
+       return fd;
+}
+
+static struct page *hva_to_page(unsigned long addr)
+{
+       struct page *page[1];
+       int npages;
+
+       might_sleep();
+
+       npages = get_user_pages_fast(addr, 1, 1, page);
+
+       if (unlikely(npages != 1))
+               return 0;
+
+       return page[0];
+}
+
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem)
+{
+       unsigned long psize, porder;
+       unsigned long i, npages, totalpages;
+       unsigned long pg_ix;
+       struct kvmppc_pginfo *pginfo;
+       unsigned long hva;
+       struct kvmppc_rma_info *ri = NULL;
+       struct page *page;
+
+       /* For now, only allow 16MB pages */
+       porder = LARGE_PAGE_ORDER;
+       psize = 1ul << porder;
+       if ((mem->memory_size & (psize - 1)) ||
+           (mem->guest_phys_addr & (psize - 1))) {
+               pr_err("bad memory_size=%llx @ %llx\n",
+                      mem->memory_size, mem->guest_phys_addr);
+               return -EINVAL;
+       }
+
+       npages = mem->memory_size >> porder;
+       totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;
+
+       /* More memory than we have space to track? */
+       if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
+               return -EINVAL;
+
+       /* Do we already have an RMA registered? */
+       if (mem->guest_phys_addr == 0 && kvm->arch.rma)
+               return -EINVAL;
+
+       if (totalpages > kvm->arch.ram_npages)
+               kvm->arch.ram_npages = totalpages;
+
+       /* Is this one of our preallocated RMAs? */
+       if (mem->guest_phys_addr == 0) {
+               struct vm_area_struct *vma;
+
+               down_read(&current->mm->mmap_sem);
+               vma = find_vma(current->mm, mem->userspace_addr);
+               if (vma && vma->vm_file &&
+                   vma->vm_file->f_op == &kvm_rma_fops &&
+                   mem->userspace_addr == vma->vm_start)
+                       ri = vma->vm_file->private_data;
+               up_read(&current->mm->mmap_sem);
+               if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) {
+                       pr_err("CPU requires an RMO\n");
+                       return -EINVAL;
+               }
+       }
+
+       if (ri) {
+               unsigned long rma_size;
+               unsigned long lpcr;
+               long rmls;
+
+               rma_size = ri->npages << PAGE_SHIFT;
+               if (rma_size > mem->memory_size)
+                       rma_size = mem->memory_size;
+               rmls = lpcr_rmls(rma_size);
+               if (rmls < 0) {
+                       pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
+                       return -EINVAL;
+               }
+               atomic_inc(&ri->use_count);
+               kvm->arch.rma = ri;
+               kvm->arch.n_rma_pages = rma_size >> porder;
+
+               /* Update LPCR and RMOR */
+               lpcr = kvm->arch.lpcr;
+               if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+                       /* PPC970; insert RMLS value (split field) in HID4 */
+                       lpcr &= ~((1ul << HID4_RMLS0_SH) |
+                                 (3ul << HID4_RMLS2_SH));
+                       lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) |
+                               ((rmls & 3) << HID4_RMLS2_SH);
+                       /* RMOR is also in HID4 */
+                       lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
+                               << HID4_RMOR_SH;
+               } else {
+                       /* POWER7 */
+                       lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
+                       lpcr |= rmls << LPCR_RMLS_SH;
+                       kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
+               }
+               kvm->arch.lpcr = lpcr;
+               pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
+                       ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
+       }
+
+       pg_ix = mem->guest_phys_addr >> porder;
+       pginfo = kvm->arch.ram_pginfo + pg_ix;
+       for (i = 0; i < npages; ++i, ++pg_ix) {
+               if (ri && pg_ix < kvm->arch.n_rma_pages) {
+                       pginfo[i].pfn = ri->base_pfn +
+                               (pg_ix << (porder - PAGE_SHIFT));
+                       continue;
+               }
+               hva = mem->userspace_addr + (i << porder);
+               page = hva_to_page(hva);
+               if (!page) {
+                       pr_err("oops, no pfn for hva %lx\n", hva);
+                       goto err;
+               }
+               /* Check it's a 16MB page */
+               if (!PageHead(page) ||
+                   compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
+                       pr_err("page at %lx isn't 16MB (o=%d)\n",
+                              hva, compound_order(page));
+                       goto err;
+               }
+               pginfo[i].pfn = page_to_pfn(page);
+       }
+
+       return 0;
+
+ err:
+       return -EINVAL;
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem)
+{
+       if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
+           !kvm->arch.rma)
+               kvmppc_map_vrma(kvm, mem);
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+       long r;
+       unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
+       long err = -ENOMEM;
+       unsigned long lpcr;
+
+       /* Allocate hashed page table */
+       r = kvmppc_alloc_hpt(kvm);
+       if (r)
+               return r;
+
+       INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+
+       kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
+                                      GFP_KERNEL);
+       if (!kvm->arch.ram_pginfo) {
+               pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
+                      npages * sizeof(struct kvmppc_pginfo));
+               goto out_free;
+       }
+
+       kvm->arch.ram_npages = 0;
+       kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
+       kvm->arch.ram_porder = LARGE_PAGE_ORDER;
+       kvm->arch.rma = NULL;
+       kvm->arch.n_rma_pages = 0;
+
+       kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
+
+       if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+               /* PPC970; HID4 is effectively the LPCR */
+               unsigned long lpid = kvm->arch.lpid;
+               kvm->arch.host_lpid = 0;
+               kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
+               lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
+               lpcr |= ((lpid >> 4) << HID4_LPID1_SH) |
+                       ((lpid & 0xf) << HID4_LPID5_SH);
+       } else {
+               /* POWER7; init LPCR for virtual RMA mode */
+               kvm->arch.host_lpid = mfspr(SPRN_LPID);
+               kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
+               lpcr &= LPCR_PECE | LPCR_LPES;
+               lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
+                       LPCR_VPM0 | LPCR_VRMA_L;
+       }
+       kvm->arch.lpcr = lpcr;
+
+       return 0;
+
+ out_free:
+       kvmppc_free_hpt(kvm);
+       return err;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+       struct kvmppc_pginfo *pginfo;
+       unsigned long i;
+
+       if (kvm->arch.ram_pginfo) {
+               pginfo = kvm->arch.ram_pginfo;
+               kvm->arch.ram_pginfo = NULL;
+               for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
+                       if (pginfo[i].pfn)
+                               put_page(pfn_to_page(pginfo[i].pfn));
+               kfree(pginfo);
+       }
+       if (kvm->arch.rma) {
+               kvm_release_rma(kvm->arch.rma);
+               kvm->arch.rma = NULL;
+       }
+
+       kvmppc_free_hpt(kvm);
+       WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
+}
+
+/* These are stubs for now */
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+}
+
+/* We don't need to emulate any privileged instructions or dcbz */
+int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                           unsigned int inst, int *advance)
+{
+       return EMULATE_FAIL;
+}
+
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+       return EMULATE_FAIL;
+}
+
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+       return EMULATE_FAIL;
+}
+
+static int kvmppc_book3s_hv_init(void)
+{
+       int r;
+
+       r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+
+       if (r)
+               return r;
+
+       r = kvmppc_mmu_hv_init();
+
+       return r;
+}
+
+static void kvmppc_book3s_hv_exit(void)
+{
+       kvm_exit();
+}
+
+module_init(kvmppc_book3s_hv_init);
+module_exit(kvmppc_book3s_hv_exit);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
new file mode 100644 (file)
index 0000000..d431203
--- /dev/null
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+
+#include <asm/cputable.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+/*
+ * This maintains a list of RMAs (real mode areas) for KVM guests to use.
+ * Each RMA has to be physically contiguous and of a size that the
+ * hardware supports.  PPC970 and POWER7 support 64MB, 128MB and 256MB,
+ * and other larger sizes.  Since we are unlikely to be allocate that
+ * much physically contiguous memory after the system is up and running,
+ * we preallocate a set of RMAs in early boot for KVM to use.
+ */
+static unsigned long kvm_rma_size = 64 << 20;  /* 64MB */
+static unsigned long kvm_rma_count;
+
+static int __init early_parse_rma_size(char *p)
+{
+       if (!p)
+               return 1;
+
+       kvm_rma_size = memparse(p, &p);
+
+       return 0;
+}
+early_param("kvm_rma_size", early_parse_rma_size);
+
+static int __init early_parse_rma_count(char *p)
+{
+       if (!p)
+               return 1;
+
+       kvm_rma_count = simple_strtoul(p, NULL, 0);
+
+       return 0;
+}
+early_param("kvm_rma_count", early_parse_rma_count);
+
+static struct kvmppc_rma_info *rma_info;
+static LIST_HEAD(free_rmas);
+static DEFINE_SPINLOCK(rma_lock);
+
+/* Work out RMLS (real mode limit selector) field value for a given RMA size.
+   Assumes POWER7 or PPC970. */
+static inline int lpcr_rmls(unsigned long rma_size)
+{
+       switch (rma_size) {
+       case 32ul << 20:        /* 32 MB */
+               if (cpu_has_feature(CPU_FTR_ARCH_206))
+                       return 8;       /* only supported on POWER7 */
+               return -1;
+       case 64ul << 20:        /* 64 MB */
+               return 3;
+       case 128ul << 20:       /* 128 MB */
+               return 7;
+       case 256ul << 20:       /* 256 MB */
+               return 4;
+       case 1ul << 30:         /* 1 GB */
+               return 2;
+       case 16ul << 30:        /* 16 GB */
+               return 1;
+       case 256ul << 30:       /* 256 GB */
+               return 0;
+       default:
+               return -1;
+       }
+}
+
+/*
+ * Called at boot time while the bootmem allocator is active,
+ * to allocate contiguous physical memory for the real memory
+ * areas for guests.
+ */
+void kvm_rma_init(void)
+{
+       unsigned long i;
+       unsigned long j, npages;
+       void *rma;
+       struct page *pg;
+
+       /* Only do this on PPC970 in HV mode */
+       if (!cpu_has_feature(CPU_FTR_HVMODE) ||
+           !cpu_has_feature(CPU_FTR_ARCH_201))
+               return;
+
+       if (!kvm_rma_size || !kvm_rma_count)
+               return;
+
+       /* Check that the requested size is one supported in hardware */
+       if (lpcr_rmls(kvm_rma_size) < 0) {
+               pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size);
+               return;
+       }
+
+       npages = kvm_rma_size >> PAGE_SHIFT;
+       rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info));
+       for (i = 0; i < kvm_rma_count; ++i) {
+               rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size);
+               pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma,
+                       kvm_rma_size >> 20);
+               rma_info[i].base_virt = rma;
+               rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT;
+               rma_info[i].npages = npages;
+               list_add_tail(&rma_info[i].list, &free_rmas);
+               atomic_set(&rma_info[i].use_count, 0);
+
+               pg = pfn_to_page(rma_info[i].base_pfn);
+               for (j = 0; j < npages; ++j) {
+                       atomic_inc(&pg->_count);
+                       ++pg;
+               }
+       }
+}
+
+struct kvmppc_rma_info *kvm_alloc_rma(void)
+{
+       struct kvmppc_rma_info *ri;
+
+       ri = NULL;
+       spin_lock(&rma_lock);
+       if (!list_empty(&free_rmas)) {
+               ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list);
+               list_del(&ri->list);
+               atomic_inc(&ri->use_count);
+       }
+       spin_unlock(&rma_lock);
+       return ri;
+}
+EXPORT_SYMBOL_GPL(kvm_alloc_rma);
+
+void kvm_release_rma(struct kvmppc_rma_info *ri)
+{
+       if (atomic_dec_and_test(&ri->use_count)) {
+               spin_lock(&rma_lock);
+               list_add_tail(&ri->list, &free_rmas);
+               spin_unlock(&rma_lock);
+
+       }
+}
+EXPORT_SYMBOL_GPL(kvm_release_rma);
+
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
new file mode 100644 (file)
index 0000000..3f7b674
--- /dev/null
@@ -0,0 +1,166 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_interrupts.S, which is:
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+#include <asm/ppc-opcode.h>
+
+/*****************************************************************************
+ *                                                                           *
+ *     Guest entry / exit code that is in kernel module memory (vmalloc)     *
+ *                                                                           *
+ ****************************************************************************/
+
+/* Registers:
+ *  r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcore_entry)
+
+       /* Write correct stack frame */
+       mflr    r0
+       std     r0,PPC_LR_STKOFF(r1)
+
+       /* Save host state to the stack */
+       stdu    r1, -SWITCH_FRAME_SIZE(r1)
+
+       /* Save non-volatile registers (r14 - r31) */
+       SAVE_NVGPRS(r1)
+
+       /* Save host DSCR */
+BEGIN_FTR_SECTION
+       mfspr   r3, SPRN_DSCR
+       std     r3, HSTATE_DSCR(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /* Save host DABR */
+       mfspr   r3, SPRN_DABR
+       std     r3, HSTATE_DABR(r13)
+
+       /* Hard-disable interrupts */
+       mfmsr   r10
+       std     r10, HSTATE_HOST_MSR(r13)
+       rldicl  r10,r10,48,1
+       rotldi  r10,r10,16
+       mtmsrd  r10,1
+
+       /* Save host PMU registers and load guest PMU registers */
+       /* R4 is live here (vcpu pointer) but not r3 or r5 */
+       li      r3, 1
+       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
+       mfspr   r7, SPRN_MMCR0          /* save MMCR0 */
+       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable interrupts */
+       isync
+       ld      r3, PACALPPACAPTR(r13)  /* is the host using the PMU? */
+       lbz     r5, LPPACA_PMCINUSE(r3)
+       cmpwi   r5, 0
+       beq     31f                     /* skip if not */
+       mfspr   r5, SPRN_MMCR1
+       mfspr   r6, SPRN_MMCRA
+       std     r7, HSTATE_MMCR(r13)
+       std     r5, HSTATE_MMCR + 8(r13)
+       std     r6, HSTATE_MMCR + 16(r13)
+       mfspr   r3, SPRN_PMC1
+       mfspr   r5, SPRN_PMC2
+       mfspr   r6, SPRN_PMC3
+       mfspr   r7, SPRN_PMC4
+       mfspr   r8, SPRN_PMC5
+       mfspr   r9, SPRN_PMC6
+BEGIN_FTR_SECTION
+       mfspr   r10, SPRN_PMC7
+       mfspr   r11, SPRN_PMC8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       stw     r3, HSTATE_PMC(r13)
+       stw     r5, HSTATE_PMC + 4(r13)
+       stw     r6, HSTATE_PMC + 8(r13)
+       stw     r7, HSTATE_PMC + 12(r13)
+       stw     r8, HSTATE_PMC + 16(r13)
+       stw     r9, HSTATE_PMC + 20(r13)
+BEGIN_FTR_SECTION
+       stw     r10, HSTATE_PMC + 24(r13)
+       stw     r11, HSTATE_PMC + 28(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+31:
+
+       /*
+        * Put whatever is in the decrementer into the
+        * hypervisor decrementer.
+        */
+       mfspr   r8,SPRN_DEC
+       mftb    r7
+       mtspr   SPRN_HDEC,r8
+       extsw   r8,r8
+       add     r8,r8,r7
+       std     r8,HSTATE_DECEXP(r13)
+
+       /*
+        * On PPC970, if the guest vcpu has an external interrupt pending,
+        * send ourselves an IPI so as to interrupt the guest once it
+        * enables interrupts.  (It must have interrupts disabled,
+        * otherwise we would already have delivered the interrupt.)
+        */
+BEGIN_FTR_SECTION
+       ld      r0, VCPU_PENDING_EXC(r4)
+       li      r7, (1 << BOOK3S_IRQPRIO_EXTERNAL)
+       oris    r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+       and.    r0, r0, r7
+       beq     32f
+       mr      r31, r4
+       lhz     r3, PACAPACAINDEX(r13)
+       bl      smp_send_reschedule
+       nop
+       mr      r4, r31
+32:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+
+       /* Jump to partition switch code */
+       bl      .kvmppc_hv_entry_trampoline
+       nop
+
+/*
+ * We return here in virtual mode after the guest exits
+ * with something that we can't handle in real mode.
+ * Interrupts are enabled again at this point.
+ */
+
+.global kvmppc_handler_highmem
+kvmppc_handler_highmem:
+
+       /*
+        * Register usage at this point:
+        *
+        * R1       = host R1
+        * R2       = host R2
+        * R12      = exit handler id
+        * R13      = PACA
+        */
+
+       /* Restore non-volatile host registers (r14 - r31) */
+       REST_NVGPRS(r1)
+
+       addi    r1, r1, SWITCH_FRAME_SIZE
+       ld      r0, PPC_LR_STKOFF(r1)
+       mtlr    r0
+       blr
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
new file mode 100644 (file)
index 0000000..fcfe6b0
--- /dev/null
@@ -0,0 +1,370 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/hugetlb.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER      24
+#define HPT_NPTEG      (1ul << (HPT_ORDER - 7))        /* 128B per pteg */
+#define HPT_HASH_MASK  (HPT_NPTEG - 1)
+
+#define HPTE_V_HVLOCK  0x40UL
+
+/*
+ * Try to lock an HPTE by atomically setting HPTE_V_HVLOCK in its first
+ * doubleword.  The attempt fails if any bit of @bits is already set in
+ * the HPTE (callers pass HPTE_V_HVLOCK, optionally | HPTE_V_VALID).
+ * Returns 1 if the lock was taken, 0 otherwise.
+ *
+ * NOTE(review): on the stdcx. failure path, "li %1,%3" uses the register
+ * operand %3 where an immediate is expected, so 'old' becomes the (always
+ * non-zero) register number rather than @bits.  Only non-zero-ness appears
+ * to matter for the return value, but this deserves confirmation.
+ */
+static inline long lock_hpte(unsigned long *hpte, unsigned long bits)
+{
+       unsigned long tmp, old;
+
+       /* ldarx/stdcx. retry-free attempt: a single failed reservation
+        * also reports failure rather than looping; callers spin. */
+       asm volatile("  ldarx   %0,0,%2\n"
+                    "  and.    %1,%0,%3\n"
+                    "  bne     2f\n"
+                    "  ori     %0,%0,%4\n"
+                    "  stdcx.  %0,0,%2\n"
+                    "  beq+    2f\n"
+                    "  li      %1,%3\n"
+                    "2:        isync"
+                    : "=&r" (tmp), "=&r" (old)
+                    : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
+                    : "cc", "memory");
+       return old == 0;
+}
+
+/*
+ * Real-mode handler for the H_ENTER hcall: insert a guest HPTE into the
+ * hash page table.  Validates the page size against what the VM's RAM
+ * supports, converts the guest logical page number to a real address,
+ * and claims either any free slot in the PTEG or (with H_EXACT) the
+ * requested slot.  On success the chosen PTE index is passed back to
+ * the guest in GPR4 and H_SUCCESS is returned; otherwise H_PARAMETER
+ * or H_PTEG_FULL.
+ */
+long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+                   long pte_index, unsigned long pteh, unsigned long ptel)
+{
+       unsigned long porder;
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long i, lpn, pa;
+       unsigned long *hpte;
+
+       /* only handle 4k, 64k and 16M pages for now */
+       porder = 12;
+       if (pteh & HPTE_V_LARGE) {
+               if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+                   (ptel & 0xf000) == 0x1000) {
+                       /* 64k page */
+                       porder = 16;
+               } else if ((ptel & 0xff000) == 0) {
+                       /* 16M page */
+                       porder = 24;
+                       /* lowest AVA bit must be 0 for 16M pages */
+                       if (pteh & 0x80)
+                               return H_PARAMETER;
+               } else
+                       return H_PARAMETER;
+       }
+       /* Translate the guest logical page number to a real page,
+        * rejecting pages outside guest RAM or larger than its pages. */
+       lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
+       if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder)
+               return H_PARAMETER;
+       pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
+       if (!pa)
+               return H_PARAMETER;
+       /* Check WIMG */
+       if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
+           (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+               return H_PARAMETER;
+       /* Clear 0x60 in the V dword (covers HPTE_V_HVLOCK; presumably
+        * also a neighbouring software bit -- confirm) and substitute
+        * the real page number for the guest's logical one. */
+       pteh &= ~0x60UL;
+       ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
+       ptel |= pa;
+       if (pte_index >= (HPT_NPTEG << 3))
+               return H_PARAMETER;
+       if (likely((flags & H_EXACT) == 0)) {
+               /* Search the whole PTEG for an invalid, lockable slot. */
+               pte_index &= ~7UL;
+               hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+               for (i = 0; ; ++i) {
+                       if (i == 8)
+                               return H_PTEG_FULL;
+                       if ((*hpte & HPTE_V_VALID) == 0 &&
+                           lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+                               break;
+                       hpte += 2;
+               }
+       } else {
+               /* H_EXACT: the requested slot must be free. */
+               i = 0;
+               hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+               if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+                       return H_PTEG_FULL;
+       }
+       /* Publish the R dword before the V dword; writing pteh (which has
+        * HVLOCK cleared above) both validates the entry and unlocks it. */
+       hpte[1] = ptel;
+       eieio();
+       hpte[0] = pteh;
+       asm volatile("ptesync" : : : "memory");
+       atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
+       vcpu->arch.gpr[4] = pte_index + i;
+       return H_SUCCESS;
+}
+
+/*
+ * Build the RB operand for a tlbie/tlbiel instruction from an HPTE's
+ * V and R doublewords plus its index in the hash table.  The low VA
+ * bits not stored in the AVA field are reconstructed from the PTEG
+ * position (pte_index >> 3), xor'ed with the VSID hash component.
+ */
+static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
+                                     unsigned long pte_index)
+{
+       unsigned long rb, va_low;
+
+       rb = (v & ~0x7fUL) << 16;               /* AVA field */
+       va_low = pte_index >> 3;
+       if (v & HPTE_V_SECONDARY)
+               va_low = ~va_low;
+       /* xor vsid from AVA */
+       if (!(v & HPTE_V_1TB_SEG))
+               va_low ^= v >> 12;
+       else
+               va_low ^= v >> 24;
+       va_low &= 0x7ff;
+       if (v & HPTE_V_LARGE) {
+               rb |= 1;                        /* L field */
+               if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+                   (r & 0xff000)) {
+                       /* non-16MB large page, must be 64k */
+                       /* (masks depend on page size) */
+                       rb |= 0x1000;           /* page encoding in LP field */
+                       rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
+                       rb |= (va_low & 0xfe);  /* AVAL field (P7 doesn't seem to care) */
+               }
+       } else {
+               /* 4kB page */
+               rb |= (va_low & 0x7ff) << 12;   /* remaining 11b of VA */
+       }
+       rb |= (v >> 54) & 0x300;                /* B field */
+       return rb;
+}
+
+#define LOCK_TOKEN     (*(u32 *)(&get_paca()->lock_token))
+
+/*
+ * Try once to acquire a tlbie lock word with lwarx/stwcx., storing this
+ * CPU's paca lock token as the owner value.  Returns 1 if the lock was
+ * taken, 0 if it was already held (callers spin with cpu_relax()).
+ */
+static inline int try_lock_tlbie(unsigned int *lock)
+{
+       unsigned int tmp, old;
+       unsigned int token = LOCK_TOKEN;
+
+       /* Retry only on a lost reservation (bne- 1b); a held lock
+        * (non-zero word) reports failure immediately. */
+       asm volatile("1:lwarx   %1,0,%2\n"
+                    "  cmpwi   cr0,%1,0\n"
+                    "  bne     2f\n"
+                    "  stwcx.  %3,0,%2\n"
+                    "  bne-    1b\n"
+                    "  isync\n"
+                    "2:"
+                    : "=&r" (tmp), "=&r" (old)
+                    : "r" (lock), "r" (token)
+                    : "cc", "memory");
+       return old == 0;
+}
+
+/*
+ * Real-mode handler for the H_REMOVE hcall: invalidate one HPTE if it
+ * is valid and matches the optional H_AVPN / H_ANDCOND criteria.  The
+ * removed V and R doublewords are returned to the guest in GPR4/GPR5.
+ * When only one vcpu is online the TLB entry is flushed locally with
+ * tlbiel; otherwise a broadcast tlbie is issued under tlbie_lock.
+ */
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+                    unsigned long pte_index, unsigned long avpn,
+                    unsigned long va)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long *hpte;
+       unsigned long v, r, rb;
+
+       if (pte_index >= (HPT_NPTEG << 3))
+               return H_PARAMETER;
+       hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+       while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+               cpu_relax();
+       /* Not valid, or fails the AVPN/ANDCOND match: unlock and bail. */
+       if ((hpte[0] & HPTE_V_VALID) == 0 ||
+           ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
+           ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
+               hpte[0] &= ~HPTE_V_HVLOCK;
+               return H_NOT_FOUND;
+       }
+       if (atomic_read(&kvm->online_vcpus) == 1)
+               flags |= H_LOCAL;
+       /* Hand the old PTE contents back to the guest (lock bit masked). */
+       vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK;
+       vcpu->arch.gpr[5] = r = hpte[1];
+       rb = compute_tlbie_rb(v, r, pte_index);
+       /* Zeroing the V dword both invalidates and drops HVLOCK. */
+       hpte[0] = 0;
+       if (!(flags & H_LOCAL)) {
+               while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+                       cpu_relax();
+               asm volatile("ptesync" : : : "memory");
+               asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+                            : : "r" (rb), "r" (kvm->arch.lpid));
+               asm volatile("ptesync" : : : "memory");
+               kvm->arch.tlbie_lock = 0;
+       } else {
+               asm volatile("ptesync" : : : "memory");
+               asm volatile("tlbiel %0" : : "r" (rb));
+               asm volatile("ptesync" : : : "memory");
+       }
+       return H_SUCCESS;
+}
+
+/*
+ * Real-mode handler for the H_BULK_REMOVE hcall: process up to four
+ * remove requests packed into GPR4..GPR11 (one flags/index doubleword
+ * and one match-value doubleword per request).  For each request the
+ * completion status is written back into the top byte of its
+ * flags/index argument; matching entries are invalidated and their
+ * TLB entries flushed in a batch at the end (tlbiel when only one
+ * vcpu is online, broadcast tlbie otherwise).
+ */
+long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long *args = &vcpu->arch.gpr[4];
+       unsigned long *hp, tlbrb[4];
+       long int i, found;
+       long int n_inval = 0;
+       unsigned long flags, req, pte_index;
+       long int local = 0;
+       long int ret = H_SUCCESS;
+
+       if (atomic_read(&kvm->online_vcpus) == 1)
+               local = 1;
+       for (i = 0; i < 4; ++i) {
+               /* Top byte of the argument: request type (bits 6-7)
+                * and match flags (bits 0-1). */
+               pte_index = args[i * 2];
+               flags = pte_index >> 56;
+               pte_index &= ((1ul << 56) - 1);
+               req = flags >> 6;
+               flags &= 3;
+               if (req == 3)
+                       break;          /* end-of-list marker */
+               if (req != 1 || flags == 3 ||
+                   pte_index >= (HPT_NPTEG << 3)) {
+                       /* parameter error */
+                       args[i * 2] = ((0xa0 | flags) << 56) + pte_index;
+                       ret = H_PARAMETER;
+                       break;
+               }
+               hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+               while (!lock_hpte(hp, HPTE_V_HVLOCK))
+                       cpu_relax();
+               found = 0;
+               if (hp[0] & HPTE_V_VALID) {
+                       switch (flags & 3) {
+                       case 0:         /* absolute */
+                               found = 1;
+                               break;
+                       case 1:         /* andcond */
+                               if (!(hp[0] & args[i * 2 + 1]))
+                                       found = 1;
+                               break;
+                       case 2:         /* AVPN */
+                               if ((hp[0] & ~0x7fUL) == args[i * 2 + 1])
+                                       found = 1;
+                               break;
+                       }
+               }
+               if (!found) {
+                       /* Unlock and report "not found" for this slot. */
+                       hp[0] &= ~HPTE_V_HVLOCK;
+                       args[i * 2] = ((0x90 | flags) << 56) + pte_index;
+                       continue;
+               }
+               /* insert R and C bits from PTE */
+               flags |= (hp[1] >> 5) & 0x0c;
+               args[i * 2] = ((0x80 | flags) << 56) + pte_index;
+               /* Queue the tlbie RB value; zeroing hp[0] invalidates
+                * the entry and drops HVLOCK in one store. */
+               tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index);
+               hp[0] = 0;
+       }
+       if (n_inval == 0)
+               return ret;
+
+       /* Flush all invalidated translations in one batch. */
+       if (!local) {
+               while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+                       cpu_relax();
+               asm volatile("ptesync" : : : "memory");
+               for (i = 0; i < n_inval; ++i)
+                       asm volatile(PPC_TLBIE(%1,%0)
+                                    : : "r" (tlbrb[i]), "r" (kvm->arch.lpid));
+               asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+               kvm->arch.tlbie_lock = 0;
+       } else {
+               asm volatile("ptesync" : : : "memory");
+               for (i = 0; i < n_inval; ++i)
+                       asm volatile("tlbiel %0" : : "r" (tlbrb[i]));
+               asm volatile("ptesync" : : : "memory");
+       }
+       return ret;
+}
+
+/*
+ * Real-mode handler for the H_PROTECT hcall: change the protection
+ * (PP, N) and key bits of an existing HPTE.  The entry is temporarily
+ * marked invalid while its old translation is flushed from the TLB,
+ * then rewritten with the new R dword and re-validated/unlocked.
+ */
+long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
+                     unsigned long pte_index, unsigned long avpn,
+                     unsigned long va)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long *hpte;
+       unsigned long v, r, rb;
+
+       if (pte_index >= (HPT_NPTEG << 3))
+               return H_PARAMETER;
+       hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+       while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+               cpu_relax();
+       if ((hpte[0] & HPTE_V_VALID) == 0 ||
+           ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
+               hpte[0] &= ~HPTE_V_HVLOCK;
+               return H_NOT_FOUND;
+       }
+       if (atomic_read(&kvm->online_vcpus) == 1)
+               flags |= H_LOCAL;
+       v = hpte[0];
+       /* Splice the new protection/key bits from the flags argument
+        * into the R dword, preserving everything else. */
+       r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+                       HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+       r |= (flags << 55) & HPTE_R_PP0;
+       r |= (flags << 48) & HPTE_R_KEY_HI;
+       r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+       rb = compute_tlbie_rb(v, r, pte_index);
+       /* Invalidate (still locked) while the old translation is flushed. */
+       hpte[0] = v & ~HPTE_V_VALID;
+       if (!(flags & H_LOCAL)) {
+               while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+                       cpu_relax();
+               asm volatile("ptesync" : : : "memory");
+               asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+                            : : "r" (rb), "r" (kvm->arch.lpid));
+               asm volatile("ptesync" : : : "memory");
+               kvm->arch.tlbie_lock = 0;
+       } else {
+               asm volatile("ptesync" : : : "memory");
+               asm volatile("tlbiel %0" : : "r" (rb));
+               asm volatile("ptesync" : : : "memory");
+       }
+       /* Publish the new R dword, then restore V (clearing HVLOCK). */
+       hpte[1] = r;
+       eieio();
+       hpte[0] = v & ~HPTE_V_HVLOCK;
+       asm volatile("ptesync" : : : "memory");
+       return H_SUCCESS;
+}
+
+/*
+ * Translate a guest real (machine) address back to the guest's logical
+ * address by a linear O(n) search of the ram_pginfo pfn array.  Returns
+ * HPTE_R_RPN (all ones in the RPN field) when no page matches.
+ */
+static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
+{
+       long int i;
+       unsigned long offset, rpn;
+
+       offset = realaddr & (kvm->arch.ram_psize - 1);
+       rpn = (realaddr - offset) >> PAGE_SHIFT;
+       for (i = 0; i < kvm->arch.ram_npages; ++i)
+               if (rpn == kvm->arch.ram_pginfo[i].pfn)
+                       return (i << PAGE_SHIFT) + offset;
+       return HPTE_R_RPN;      /* all 1s in the RPN field */
+}
+
+/*
+ * Real-mode handler for the H_READ hcall: return the raw contents of
+ * one HPTE (or a 4-entry aligned group when H_READ_4 is set) to the
+ * guest in GPR4 onwards.  With H_R_XLATE, the real page number in a
+ * valid entry is converted back to the guest's logical address.
+ * Note: entries are read without taking the HVLOCK.
+ */
+long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
+                  unsigned long pte_index)
+{
+       struct kvm *kvm = vcpu->kvm;
+       unsigned long *hpte, r;
+       int i, n = 1;
+
+       if (pte_index >= (HPT_NPTEG << 3))
+               return H_PARAMETER;
+       if (flags & H_READ_4) {
+               pte_index &= ~3;
+               n = 4;
+       }
+       for (i = 0; i < n; ++i, ++pte_index) {
+               hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+               r = hpte[1];
+               if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID))
+                       r = reverse_xlate(kvm, r & HPTE_R_RPN) |
+                               (r & ~HPTE_R_RPN);
+               vcpu->arch.gpr[4 + i * 2] = hpte[0];
+               vcpu->arch.gpr[5 + i * 2] = r;
+       }
+       return H_SUCCESS;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
new file mode 100644 (file)
index 0000000..6dd3358
--- /dev/null
@@ -0,0 +1,1345 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_rmhandlers.S and other files, which are:
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+
+/*****************************************************************************
+ *                                                                           *
+ *        Real Mode handlers that need to be in the linear mapping           *
+ *                                                                           *
+ ****************************************************************************/
+
+/* Skip over the instruction at the interrupted PC: bump SRR0 by 4,
+ * restore r13 from the scratch SPR and return with rfid. */
+       .globl  kvmppc_skip_interrupt
+kvmppc_skip_interrupt:
+       mfspr   r13,SPRN_SRR0
+       addi    r13,r13,4
+       mtspr   SPRN_SRR0,r13
+       GET_SCRATCH0(r13)
+       rfid
+       b       .
+
+/* Same as kvmppc_skip_interrupt, but for hypervisor-level interrupts:
+ * advance HSRR0 by 4 and return with hrfid. */
+       .globl  kvmppc_skip_Hinterrupt
+kvmppc_skip_Hinterrupt:
+       mfspr   r13,SPRN_HSRR0
+       addi    r13,r13,4
+       mtspr   SPRN_HSRR0,r13
+       GET_SCRATCH0(r13)
+       hrfid
+       b       .
+
+/*
+ * Call kvmppc_handler_trampoline_enter in real mode.
+ * Must be called with interrupts hard-disabled.
+ *
+ * Input Registers:
+ *
+ * LR = return address to continue at after eventually re-enabling MMU
+ */
+_GLOBAL(kvmppc_hv_entry_trampoline)
+       mfmsr   r10
+       LOAD_REG_ADDR(r5, kvmppc_hv_entry)
+       li      r0,MSR_RI
+       andc    r0,r10,r0
+       li      r6,MSR_IR | MSR_DR
+       andc    r6,r10,r6
+       /* Clear RI first so the SRR0/SRR1 update cannot be corrupted by
+        * a recoverable interrupt, then rfid with IR/DR clear to enter
+        * kvmppc_hv_entry in real mode.  LR still holds the virtual-mode
+        * return address. */
+       mtmsrd  r0,1            /* clear RI in MSR */
+       mtsrr0  r5
+       mtsrr1  r6
+       RFI
+
+#define ULONG_SIZE             8
+#define VCPU_GPR(n)            (VCPU_GPRS + (n * ULONG_SIZE))
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+#define XICS_XIRR              4
+#define XICS_QIRR              0xc
+
+/*
+ * We come in here when wakened from nap mode on a secondary hw thread.
+ * Relocation is off and most register values are lost.
+ * r13 points to the PACA.
+ */
+       .globl  kvm_start_guest
+kvm_start_guest:
+       /* Use the PACA emergency stack; normal stack state was lost in nap. */
+       ld      r1,PACAEMERGSP(r13)
+       subi    r1,r1,STACK_FRAME_OVERHEAD
+
+       /* get vcpu pointer */
+       ld      r4, HSTATE_KVM_VCPU(r13)
+
+       /* We got here with an IPI; clear it */
+       ld      r5, HSTATE_XICS_PHYS(r13)
+       li      r0, 0xff
+       li      r6, XICS_QIRR
+       li      r7, XICS_XIRR
+       lwzcix  r8, r5, r7              /* ack the interrupt */
+       sync
+       stbcix  r0, r5, r6              /* clear it */
+       stwcix  r8, r5, r7              /* EOI it */
+       /* fall through to kvmppc_hv_entry with r4 = vcpu pointer */
+
+.global kvmppc_hv_entry
+kvmppc_hv_entry:
+
+       /* Required state:
+        *
+        * R4 = vcpu pointer
+        * MSR = ~IR|DR
+        * R13 = PACA
+        * R1 = host R1
+        * all other volatile GPRS = free
+        */
+       mflr    r0
+       std     r0, HSTATE_VMHANDLER(r13)
+
+       /* Load guest non-volatile GPRs (volatiles are loaded last,
+        * just before hrfid) */
+       ld      r14, VCPU_GPR(r14)(r4)
+       ld      r15, VCPU_GPR(r15)(r4)
+       ld      r16, VCPU_GPR(r16)(r4)
+       ld      r17, VCPU_GPR(r17)(r4)
+       ld      r18, VCPU_GPR(r18)(r4)
+       ld      r19, VCPU_GPR(r19)(r4)
+       ld      r20, VCPU_GPR(r20)(r4)
+       ld      r21, VCPU_GPR(r21)(r4)
+       ld      r22, VCPU_GPR(r22)(r4)
+       ld      r23, VCPU_GPR(r23)(r4)
+       ld      r24, VCPU_GPR(r24)(r4)
+       ld      r25, VCPU_GPR(r25)(r4)
+       ld      r26, VCPU_GPR(r26)(r4)
+       ld      r27, VCPU_GPR(r27)(r4)
+       ld      r28, VCPU_GPR(r28)(r4)
+       ld      r29, VCPU_GPR(r29)(r4)
+       ld      r30, VCPU_GPR(r30)(r4)
+       ld      r31, VCPU_GPR(r31)(r4)
+
+       /* Load guest PMU registers */
+       /* R4 is live here (vcpu pointer) */
+       li      r3, 1
+       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
+       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable ints */
+       isync
+       lwz     r3, VCPU_PMC(r4)        /* always load up guest PMU registers */
+       lwz     r5, VCPU_PMC + 4(r4)    /* to prevent information leak */
+       lwz     r6, VCPU_PMC + 8(r4)
+       lwz     r7, VCPU_PMC + 12(r4)
+       lwz     r8, VCPU_PMC + 16(r4)
+       lwz     r9, VCPU_PMC + 20(r4)
+BEGIN_FTR_SECTION
+       /* PMC7/8 only exist on PPC970 (ARCH_201) */
+       lwz     r10, VCPU_PMC + 24(r4)
+       lwz     r11, VCPU_PMC + 28(r4)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       mtspr   SPRN_PMC1, r3
+       mtspr   SPRN_PMC2, r5
+       mtspr   SPRN_PMC3, r6
+       mtspr   SPRN_PMC4, r7
+       mtspr   SPRN_PMC5, r8
+       mtspr   SPRN_PMC6, r9
+BEGIN_FTR_SECTION
+       mtspr   SPRN_PMC7, r10
+       mtspr   SPRN_PMC8, r11
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       ld      r3, VCPU_MMCR(r4)
+       ld      r5, VCPU_MMCR + 8(r4)
+       ld      r6, VCPU_MMCR + 16(r4)
+       mtspr   SPRN_MMCR1, r5
+       mtspr   SPRN_MMCRA, r6
+       mtspr   SPRN_MMCR0, r3
+       isync
+
+       /* Load up FP, VMX and VSX registers */
+       bl      kvmppc_load_fp
+
+BEGIN_FTR_SECTION
+       /* Switch DSCR to guest value */
+       ld      r5, VCPU_DSCR(r4)
+       mtspr   SPRN_DSCR, r5
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /*
+        * Set the decrementer to the guest decrementer.
+        */
+       ld      r8,VCPU_DEC_EXPIRES(r4)
+       mftb    r7
+       subf    r3,r7,r8
+       mtspr   SPRN_DEC,r3
+       stw     r3,VCPU_DEC(r4)
+
+       /* Load guest SPRG0-3 */
+       ld      r5, VCPU_SPRG0(r4)
+       ld      r6, VCPU_SPRG1(r4)
+       ld      r7, VCPU_SPRG2(r4)
+       ld      r8, VCPU_SPRG3(r4)
+       mtspr   SPRN_SPRG0, r5
+       mtspr   SPRN_SPRG1, r6
+       mtspr   SPRN_SPRG2, r7
+       mtspr   SPRN_SPRG3, r8
+
+       /* Save R1 in the PACA */
+       std     r1, HSTATE_HOST_R1(r13)
+
+       /* Increment yield count if they have a VPA */
+       ld      r3, VCPU_VPA(r4)
+       cmpdi   r3, 0
+       beq     25f
+       lwz     r5, LPPACA_YIELDCOUNT(r3)
+       addi    r5, r5, 1
+       stw     r5, LPPACA_YIELDCOUNT(r3)
+25:
+       /* Load up DAR and DSISR */
+       ld      r5, VCPU_DAR(r4)
+       lwz     r6, VCPU_DSISR(r4)
+       mtspr   SPRN_DAR, r5
+       mtspr   SPRN_DSISR, r6
+
+       /* Set partition DABR */
+       li      r5,3
+       ld      r6,VCPU_DABR(r4)
+       mtspr   SPRN_DABRX,r5
+       mtspr   SPRN_DABR,r6
+
+BEGIN_FTR_SECTION
+       /* Restore AMR and UAMOR, set AMOR to all 1s */
+       ld      r5,VCPU_AMR(r4)
+       ld      r6,VCPU_UAMOR(r4)
+       li      r7,-1
+       mtspr   SPRN_AMR,r5
+       mtspr   SPRN_UAMOR,r6
+       mtspr   SPRN_AMOR,r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /* Clear out SLB */
+       li      r6,0
+       slbmte  r6,r6
+       slbia
+       ptesync
+
+BEGIN_FTR_SECTION
+       b       30f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       /*
+        * POWER7 host -> guest partition switch code.
+        * We don't have to lock against concurrent tlbies,
+        * but we do have to coordinate across hardware threads.
+        */
+       /* Increment entry count iff exit count is zero. */
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       addi    r9,r5,VCORE_ENTRY_EXIT
+21:    lwarx   r3,0,r9
+       cmpwi   r3,0x100                /* any threads starting to exit? */
+       bge     secondary_too_late      /* if so we're too late to the party */
+       addi    r3,r3,1
+       stwcx.  r3,0,r9
+       bne     21b
+
+       /* Primary thread switches to guest partition. */
+       ld      r9,VCPU_KVM(r4)         /* pointer to struct kvm */
+       lwz     r6,VCPU_PTID(r4)
+       cmpwi   r6,0
+       bne     20f
+       ld      r6,KVM_SDR1(r9)
+       lwz     r7,KVM_LPID(r9)
+       li      r0,LPID_RSVD            /* switch to reserved LPID */
+       mtspr   SPRN_LPID,r0
+       ptesync
+       mtspr   SPRN_SDR1,r6            /* switch to partition page table */
+       mtspr   SPRN_LPID,r7
+       isync
+       li      r0,1
+       stb     r0,VCORE_IN_GUEST(r5)   /* signal secondaries to continue */
+       b       10f
+
+       /* Secondary threads wait for primary to have done partition switch */
+20:    lbz     r0,VCORE_IN_GUEST(r5)
+       cmpwi   r0,0
+       beq     20b
+
+       /* Set LPCR.  Set the MER bit if there is a pending external irq. */
+10:    ld      r8,KVM_LPCR(r9)
+       ld      r0,VCPU_PENDING_EXC(r4)
+       li      r7,(1 << BOOK3S_IRQPRIO_EXTERNAL)
+       oris    r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+       and.    r0,r0,r7
+       beq     11f
+       ori     r8,r8,LPCR_MER
+11:    mtspr   SPRN_LPCR,r8
+       ld      r8,KVM_RMOR(r9)
+       mtspr   SPRN_RMOR,r8
+       isync
+
+       /* Check if HDEC expires soon */
+       mfspr   r3,SPRN_HDEC
+       cmpwi   r3,10
+       li      r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+       mr      r9,r4
+       blt     hdec_soon
+
+       /*
+        * Invalidate the TLB if we could possibly have stale TLB
+        * entries for this partition on this core due to the use
+        * of tlbiel.
+        * XXX maybe only need this on primary thread?
+        */
+       ld      r9,VCPU_KVM(r4)         /* pointer to struct kvm */
+       lwz     r5,VCPU_VCPUID(r4)
+       lhz     r6,PACAPACAINDEX(r13)
+       rldimi  r6,r5,0,62              /* XXX map as if threads 1:1 p:v */
+       lhz     r8,VCPU_LAST_CPU(r4)
+       sldi    r7,r6,1                 /* see if this is the same vcpu */
+       add     r7,r7,r9                /* as last ran on this pcpu */
+       lhz     r0,KVM_LAST_VCPU(r7)
+       cmpw    r6,r8                   /* on the same cpu core as last time? */
+       bne     3f
+       cmpw    r0,r5                   /* same vcpu as this core last ran? */
+       beq     1f
+3:     sth     r6,VCPU_LAST_CPU(r4)    /* if not, invalidate partition TLB */
+       sth     r5,KVM_LAST_VCPU(r7)
+       li      r6,128
+       mtctr   r6
+       li      r7,0x800                /* IS field = 0b10 */
+       ptesync
+2:     tlbiel  r7
+       addi    r7,r7,0x1000
+       bdnz    2b
+       ptesync
+1:
+
+       /* Save purr/spurr */
+       mfspr   r5,SPRN_PURR
+       mfspr   r6,SPRN_SPURR
+       std     r5,HSTATE_PURR(r13)
+       std     r6,HSTATE_SPURR(r13)
+       ld      r7,VCPU_PURR(r4)
+       ld      r8,VCPU_SPURR(r4)
+       mtspr   SPRN_PURR,r7
+       mtspr   SPRN_SPURR,r8
+       b       31f
+
+       /*
+        * PPC970 host -> guest partition switch code.
+        * We have to lock against concurrent tlbies,
+        * using native_tlbie_lock to lock against host tlbies
+        * and kvm->arch.tlbie_lock to lock against guest tlbies.
+        * We also have to invalidate the TLB since its
+        * entries aren't tagged with the LPID.
+        */
+30:    ld      r9,VCPU_KVM(r4)         /* pointer to struct kvm */
+
+       /* first take native_tlbie_lock */
+       .section ".toc","aw"
+toc_tlbie_lock:
+       .tc     native_tlbie_lock[TC],native_tlbie_lock
+       .previous
+       ld      r3,toc_tlbie_lock@toc(2)
+       lwz     r8,PACA_LOCK_TOKEN(r13)
+24:    lwarx   r0,0,r3
+       cmpwi   r0,0
+       bne     24b
+       stwcx.  r8,0,r3
+       bne     24b
+       isync
+
+       ld      r7,KVM_LPCR(r9)         /* use kvm->arch.lpcr to store HID4 */
+       li      r0,0x18f
+       rotldi  r0,r0,HID4_LPID5_SH     /* all lpid bits in HID4 = 1 */
+       or      r0,r7,r0
+       ptesync
+       sync
+       mtspr   SPRN_HID4,r0            /* switch to reserved LPID */
+       isync
+       li      r0,0
+       stw     r0,0(r3)                /* drop native_tlbie_lock */
+
+       /* invalidate the whole TLB */
+       li      r0,256
+       mtctr   r0
+       li      r6,0
+25:    tlbiel  r6
+       addi    r6,r6,0x1000
+       bdnz    25b
+       ptesync
+
+       /* Take the guest's tlbie_lock */
+       addi    r3,r9,KVM_TLBIE_LOCK
+24:    lwarx   r0,0,r3
+       cmpwi   r0,0
+       bne     24b
+       stwcx.  r8,0,r3
+       bne     24b
+       isync
+       ld      r6,KVM_SDR1(r9)
+       mtspr   SPRN_SDR1,r6            /* switch to partition page table */
+
+       /* Set up HID4 with the guest's LPID etc. */
+       sync
+       mtspr   SPRN_HID4,r7
+       isync
+
+       /* drop the guest's tlbie_lock */
+       li      r0,0
+       stw     r0,0(r3)
+
+       /* Check if HDEC expires soon */
+       mfspr   r3,SPRN_HDEC
+       cmpwi   r3,10
+       li      r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+       mr      r9,r4
+       blt     hdec_soon
+
+       /* Enable HDEC interrupts */
+       mfspr   r0,SPRN_HID0
+       li      r3,1
+       rldimi  r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1
+       sync
+       mtspr   SPRN_HID0,r0
+       /* NOTE(review): repeated HID0 reads presumably flush/settle the
+        * HID0 update on PPC970 -- confirm against the 970 user manual */
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+
+       /* Load up guest SLB entries */
+31:    lwz     r5,VCPU_SLB_MAX(r4)
+       cmpwi   r5,0
+       beq     9f
+       mtctr   r5
+       addi    r6,r4,VCPU_SLB
+1:     ld      r8,VCPU_SLB_E(r6)
+       ld      r9,VCPU_SLB_V(r6)
+       slbmte  r9,r8
+       addi    r6,r6,VCPU_SLB_SIZE
+       bdnz    1b
+9:
+
+       /* Restore state of CTRL run bit; assume 1 on entry */
+       lwz     r5,VCPU_CTRL(r4)
+       andi.   r5,r5,1
+       bne     4f
+       mfspr   r6,SPRN_CTRLF
+       clrrdi  r6,r6,1
+       mtspr   SPRN_CTRLT,r6
+4:
+       ld      r6, VCPU_CTR(r4)
+       lwz     r7, VCPU_XER(r4)
+
+       mtctr   r6
+       mtxer   r7
+
+       /* Move SRR0 and SRR1 into the respective regs */
+       ld      r6, VCPU_SRR0(r4)
+       ld      r7, VCPU_SRR1(r4)
+       mtspr   SPRN_SRR0, r6
+       mtspr   SPRN_SRR1, r7
+
+       ld      r10, VCPU_PC(r4)
+
+       ld      r11, VCPU_MSR(r4)       /* r10 = vcpu->arch.msr & ~MSR_HV */
+       rldicl  r11, r11, 63 - MSR_HV_LG, 1
+       rotldi  r11, r11, 1 + MSR_HV_LG
+       ori     r11, r11, MSR_ME
+
+       /* r10/r11 = guest PC/MSR; entered here from elsewhere as well */
+fast_guest_return:
+       mtspr   SPRN_HSRR0,r10
+       mtspr   SPRN_HSRR1,r11
+
+       /* Activate guest mode, so faults get handled by KVM */
+       li      r9, KVM_GUEST_MODE_GUEST
+       stb     r9, HSTATE_IN_GUEST(r13)
+
+       /* Enter guest */
+
+       ld      r5, VCPU_LR(r4)
+       lwz     r6, VCPU_CR(r4)
+       mtlr    r5
+       mtcr    r6
+
+       /* Load guest volatile GPRs last; r4 (vcpu pointer) goes last of all */
+       ld      r0, VCPU_GPR(r0)(r4)
+       ld      r1, VCPU_GPR(r1)(r4)
+       ld      r2, VCPU_GPR(r2)(r4)
+       ld      r3, VCPU_GPR(r3)(r4)
+       ld      r5, VCPU_GPR(r5)(r4)
+       ld      r6, VCPU_GPR(r6)(r4)
+       ld      r7, VCPU_GPR(r7)(r4)
+       ld      r8, VCPU_GPR(r8)(r4)
+       ld      r9, VCPU_GPR(r9)(r4)
+       ld      r10, VCPU_GPR(r10)(r4)
+       ld      r11, VCPU_GPR(r11)(r4)
+       ld      r12, VCPU_GPR(r12)(r4)
+       ld      r13, VCPU_GPR(r13)(r4)
+
+       ld      r4, VCPU_GPR(r4)(r4)
+
+       hrfid
+       b       .
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+/*
+ * We come here from the first-level interrupt handlers.
+ */
+       .globl  kvmppc_interrupt
+kvmppc_interrupt:
+       /*
+        * Register contents:
+        * R12          = interrupt vector
+        * R13          = PACA
+        * guest CR, R12 saved in shadow VCPU SCRATCH1/0
+        * guest R13 saved in SPRN_SCRATCH0
+        */
+       /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
+       std     r9, HSTATE_HOST_R2(r13)
+       ld      r9, HSTATE_KVM_VCPU(r13)
+
+       /* Save registers */
+
+       std     r0, VCPU_GPR(r0)(r9)
+       std     r1, VCPU_GPR(r1)(r9)
+       std     r2, VCPU_GPR(r2)(r9)
+       std     r3, VCPU_GPR(r3)(r9)
+       std     r4, VCPU_GPR(r4)(r9)
+       std     r5, VCPU_GPR(r5)(r9)
+       std     r6, VCPU_GPR(r6)(r9)
+       std     r7, VCPU_GPR(r7)(r9)
+       std     r8, VCPU_GPR(r8)(r9)
+       ld      r0, HSTATE_HOST_R2(r13)
+       std     r0, VCPU_GPR(r9)(r9)
+       std     r10, VCPU_GPR(r10)(r9)
+       std     r11, VCPU_GPR(r11)(r9)
+       ld      r3, HSTATE_SCRATCH0(r13)
+       lwz     r4, HSTATE_SCRATCH1(r13)
+       std     r3, VCPU_GPR(r12)(r9)
+       stw     r4, VCPU_CR(r9)
+
+       /* Restore R1/R2 so we can handle faults */
+       ld      r1, HSTATE_HOST_R1(r13)
+       ld      r2, PACATOC(r13)
+
+       mfspr   r10, SPRN_SRR0
+       mfspr   r11, SPRN_SRR1
+       std     r10, VCPU_SRR0(r9)
+       std     r11, VCPU_SRR1(r9)
+       andi.   r0, r12, 2              /* need to read HSRR0/1? */
+       beq     1f
+       mfspr   r10, SPRN_HSRR0
+       mfspr   r11, SPRN_HSRR1
+       clrrdi  r12, r12, 2
+1:     std     r10, VCPU_PC(r9)
+       std     r11, VCPU_MSR(r9)
+
+       GET_SCRATCH0(r3)
+       mflr    r4
+       std     r3, VCPU_GPR(r13)(r9)
+       std     r4, VCPU_LR(r9)
+
+       /* Unset guest mode */
+       li      r0, KVM_GUEST_MODE_NONE
+       stb     r0, HSTATE_IN_GUEST(r13)
+
+       stw     r12,VCPU_TRAP(r9)
+
+       /* See if this is a leftover HDEC interrupt */
+       cmpwi   r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+       bne     2f
+       mfspr   r3,SPRN_HDEC
+       cmpwi   r3,0
+       bge     ignore_hdec
+2:
+       /* See if this is something we can handle in real mode */
+       cmpwi   r12,BOOK3S_INTERRUPT_SYSCALL
+       beq     hcall_try_real_mode
+hcall_real_cont:
+
+       /* Check for mediated interrupts (could be done earlier really ...) */
+BEGIN_FTR_SECTION
+       cmpwi   r12,BOOK3S_INTERRUPT_EXTERNAL
+       bne+    1f
+       ld      r5,VCPU_KVM(r9)
+       ld      r5,KVM_LPCR(r5)
+       andi.   r0,r11,MSR_EE
+       beq     1f
+       andi.   r0,r5,LPCR_MER
+       bne     bounce_ext_interrupt
+1:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /* Save DEC */
+       mfspr   r5,SPRN_DEC
+       mftb    r6
+       extsw   r5,r5
+       add     r5,r5,r6
+       std     r5,VCPU_DEC_EXPIRES(r9)
+
+       /* Save HEIR (HV emulation assist reg) in last_inst
+          if this is an HEI (HV emulation interrupt, e40) */
+       li      r3,-1
+BEGIN_FTR_SECTION
+       cmpwi   r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
+       bne     11f
+       mfspr   r3,SPRN_HEIR
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+11:    stw     r3,VCPU_LAST_INST(r9)
+
+       /* Save more register state  */
+       mfxer   r5
+       mfdar   r6
+       mfdsisr r7
+       mfctr   r8
+
+       stw     r5, VCPU_XER(r9)
+       std     r6, VCPU_DAR(r9)
+       stw     r7, VCPU_DSISR(r9)
+       std     r8, VCPU_CTR(r9)
+       /* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */
+BEGIN_FTR_SECTION
+       cmpwi   r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
+       beq     6f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+7:     std     r6, VCPU_FAULT_DAR(r9)
+       stw     r7, VCPU_FAULT_DSISR(r9)
+
+       /* Save guest CTRL register, set runlatch to 1 */
+       mfspr   r6,SPRN_CTRLF
+       stw     r6,VCPU_CTRL(r9)
+       andi.   r0,r6,1
+       bne     4f
+       ori     r6,r6,1
+       mtspr   SPRN_CTRLT,r6
+4:
+       /* Read the guest SLB and save it away */
+       lwz     r0,VCPU_SLB_NR(r9)      /* number of entries in SLB */
+       mtctr   r0
+       li      r6,0
+       addi    r7,r9,VCPU_SLB
+       li      r5,0
+1:     slbmfee r8,r6
+       andis.  r0,r8,SLB_ESID_V@h
+       beq     2f
+       add     r8,r8,r6                /* put index in */
+       slbmfev r3,r6
+       std     r8,VCPU_SLB_E(r7)
+       std     r3,VCPU_SLB_V(r7)
+       addi    r7,r7,VCPU_SLB_SIZE
+       addi    r5,r5,1
+2:     addi    r6,r6,1
+       bdnz    1b
+       stw     r5,VCPU_SLB_MAX(r9)
+
+       /*
+        * Save the guest PURR/SPURR
+        */
+BEGIN_FTR_SECTION
+       mfspr   r5,SPRN_PURR
+       mfspr   r6,SPRN_SPURR
+       ld      r7,VCPU_PURR(r9)
+       ld      r8,VCPU_SPURR(r9)
+       std     r5,VCPU_PURR(r9)
+       std     r6,VCPU_SPURR(r9)
+       subf    r5,r7,r5
+       subf    r6,r8,r6
+
+       /*
+        * Restore host PURR/SPURR and add guest times
+        * so that the time in the guest gets accounted.
+        */
+       ld      r3,HSTATE_PURR(r13)
+       ld      r4,HSTATE_SPURR(r13)
+       add     r3,r3,r5
+       add     r4,r4,r6
+       mtspr   SPRN_PURR,r3
+       mtspr   SPRN_SPURR,r4
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
+
+       /* Clear out SLB */
+       li      r5,0
+       slbmte  r5,r5
+       slbia
+       ptesync
+
+hdec_soon:
+BEGIN_FTR_SECTION
+       b       32f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       /*
+        * POWER7 guest -> host partition switch code.
+        * We don't have to lock against tlbies but we do
+        * have to coordinate the hardware threads.
+        */
+       /* Increment the threads-exiting-guest count in the 0xff00
+          bits of vcore->entry_exit_count */
+       lwsync
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       addi    r6,r5,VCORE_ENTRY_EXIT
+41:    lwarx   r3,0,r6
+       addi    r0,r3,0x100
+       stwcx.  r0,0,r6
+       bne     41b
+
+       /*
+        * At this point we have an interrupt that we have to pass
+        * up to the kernel or qemu; we can't handle it in real mode.
+        * Thus we have to do a partition switch, so we have to
+        * collect the other threads, if we are the first thread
+        * to take an interrupt.  To do this, we set the HDEC to 0,
+        * which causes an HDEC interrupt in all threads within 2ns
+        * because the HDEC register is shared between all 4 threads.
+        * However, we don't need to bother if this is an HDEC
+        * interrupt, since the other threads will already be on their
+        * way here in that case.
+        */
+       cmpwi   r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+       beq     40f
+       cmpwi   r3,0x100        /* Are we the first here? */
+       bge     40f
+       cmpwi   r3,1
+       ble     40f
+       li      r0,0
+       mtspr   SPRN_HDEC,r0
+40:
+
+       /* Secondary threads wait for primary to do partition switch */
+       ld      r4,VCPU_KVM(r9)         /* pointer to struct kvm */
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       lwz     r3,VCPU_PTID(r9)
+       cmpwi   r3,0
+       beq     15f
+       HMT_LOW
+13:    lbz     r3,VCORE_IN_GUEST(r5)
+       cmpwi   r3,0
+       bne     13b
+       HMT_MEDIUM
+       b       16f
+
+       /* Primary thread waits for all the secondaries to exit guest */
+15:    lwz     r3,VCORE_ENTRY_EXIT(r5)
+       srwi    r0,r3,8
+       clrldi  r3,r3,56
+       cmpw    r3,r0
+       bne     15b
+       isync
+
+       /* Primary thread switches back to host partition */
+       ld      r6,KVM_HOST_SDR1(r4)
+       lwz     r7,KVM_HOST_LPID(r4)
+       li      r8,LPID_RSVD            /* switch to reserved LPID */
+       mtspr   SPRN_LPID,r8
+       ptesync
+       mtspr   SPRN_SDR1,r6            /* switch to partition page table */
+       mtspr   SPRN_LPID,r7
+       isync
+       li      r0,0
+       stb     r0,VCORE_IN_GUEST(r5)
+       lis     r8,0x7fff               /* MAX_INT@h */
+       mtspr   SPRN_HDEC,r8
+
+16:    ld      r8,KVM_HOST_LPCR(r4)
+       mtspr   SPRN_LPCR,r8
+       isync
+       b       33f
+
+       /*
+        * PPC970 guest -> host partition switch code.
+        * We have to lock against concurrent tlbies, and
+        * we have to flush the whole TLB.
+        */
+32:    ld      r4,VCPU_KVM(r9)         /* pointer to struct kvm */
+
+       /* Take the guest's tlbie_lock */
+       lwz     r8,PACA_LOCK_TOKEN(r13)
+       addi    r3,r4,KVM_TLBIE_LOCK
+24:    lwarx   r0,0,r3
+       cmpwi   r0,0
+       bne     24b
+       stwcx.  r8,0,r3
+       bne     24b
+       isync
+
+       ld      r7,KVM_HOST_LPCR(r4)    /* use kvm->arch.host_lpcr for HID4 */
+       li      r0,0x18f
+       rotldi  r0,r0,HID4_LPID5_SH     /* all lpid bits in HID4 = 1 */
+       or      r0,r7,r0
+       ptesync
+       sync
+       mtspr   SPRN_HID4,r0            /* switch to reserved LPID */
+       isync
+       li      r0,0
+       stw     r0,0(r3)                /* drop guest tlbie_lock */
+
+       /* invalidate the whole TLB */
+       li      r0,256
+       mtctr   r0
+       li      r6,0
+25:    tlbiel  r6
+       addi    r6,r6,0x1000
+       bdnz    25b
+       ptesync
+
+       /* take native_tlbie_lock */
+       ld      r3,toc_tlbie_lock@toc(2)
+24:    lwarx   r0,0,r3
+       cmpwi   r0,0
+       bne     24b
+       stwcx.  r8,0,r3
+       bne     24b
+       isync
+
+       ld      r6,KVM_HOST_SDR1(r4)
+       mtspr   SPRN_SDR1,r6            /* switch to host page table */
+
+       /* Set up host HID4 value */
+       sync
+       mtspr   SPRN_HID4,r7
+       isync
+       li      r0,0
+       stw     r0,0(r3)                /* drop native_tlbie_lock */
+
+       lis     r8,0x7fff               /* MAX_INT@h */
+       mtspr   SPRN_HDEC,r8
+
+       /* Disable HDEC interrupts */
+       mfspr   r0,SPRN_HID0
+       li      r3,0
+       rldimi  r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1
+       sync
+       mtspr   SPRN_HID0,r0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+       mfspr   r0,SPRN_HID0
+
+       /* load host SLB entries */
+33:    ld      r8,PACA_SLBSHADOWPTR(r13)
+
+       .rept   SLB_NUM_BOLTED
+       ld      r5,SLBSHADOW_SAVEAREA(r8)
+       ld      r6,SLBSHADOW_SAVEAREA+8(r8)
+       andis.  r7,r5,SLB_ESID_V@h
+       beq     1f
+       slbmte  r6,r5
+1:     addi    r8,r8,16
+       .endr
+
+       /* Save and reset AMR and UAMOR before turning on the MMU */
+BEGIN_FTR_SECTION
+       mfspr   r5,SPRN_AMR
+       mfspr   r6,SPRN_UAMOR
+       std     r5,VCPU_AMR(r9)
+       std     r6,VCPU_UAMOR(r9)
+       li      r6,0
+       mtspr   SPRN_AMR,r6
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /* Restore host DABR and DABRX */
+       ld      r5,HSTATE_DABR(r13)
+       li      r6,7
+       mtspr   SPRN_DABR,r5
+       mtspr   SPRN_DABRX,r6
+
+       /* Switch DSCR back to host value */
+BEGIN_FTR_SECTION
+       mfspr   r8, SPRN_DSCR
+       ld      r7, HSTATE_DSCR(r13)
+       std     r8, VCPU_DSCR(r7)
+       mtspr   SPRN_DSCR, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /* Save non-volatile GPRs */
+       std     r14, VCPU_GPR(r14)(r9)
+       std     r15, VCPU_GPR(r15)(r9)
+       std     r16, VCPU_GPR(r16)(r9)
+       std     r17, VCPU_GPR(r17)(r9)
+       std     r18, VCPU_GPR(r18)(r9)
+       std     r19, VCPU_GPR(r19)(r9)
+       std     r20, VCPU_GPR(r20)(r9)
+       std     r21, VCPU_GPR(r21)(r9)
+       std     r22, VCPU_GPR(r22)(r9)
+       std     r23, VCPU_GPR(r23)(r9)
+       std     r24, VCPU_GPR(r24)(r9)
+       std     r25, VCPU_GPR(r25)(r9)
+       std     r26, VCPU_GPR(r26)(r9)
+       std     r27, VCPU_GPR(r27)(r9)
+       std     r28, VCPU_GPR(r28)(r9)
+       std     r29, VCPU_GPR(r29)(r9)
+       std     r30, VCPU_GPR(r30)(r9)
+       std     r31, VCPU_GPR(r31)(r9)
+
+       /* Save SPRGs */
+       mfspr   r3, SPRN_SPRG0
+       mfspr   r4, SPRN_SPRG1
+       mfspr   r5, SPRN_SPRG2
+       mfspr   r6, SPRN_SPRG3
+       std     r3, VCPU_SPRG0(r9)
+       std     r4, VCPU_SPRG1(r9)
+       std     r5, VCPU_SPRG2(r9)
+       std     r6, VCPU_SPRG3(r9)
+
+       /* Increment yield count if they have a VPA */
+       ld      r8, VCPU_VPA(r9)        /* do they have a VPA? */
+       cmpdi   r8, 0
+       beq     25f
+       lwz     r3, LPPACA_YIELDCOUNT(r8)
+       addi    r3, r3, 1
+       stw     r3, LPPACA_YIELDCOUNT(r8)
+25:
+       /* Save PMU registers if requested */
+       /* r8 and cr0.eq are live here */
+       li      r3, 1
+       sldi    r3, r3, 31              /* MMCR0_FC (freeze counters) bit */
+       mfspr   r4, SPRN_MMCR0          /* save MMCR0 */
+       mtspr   SPRN_MMCR0, r3          /* freeze all counters, disable ints */
+       isync
+       beq     21f                     /* if no VPA, save PMU stuff anyway */
+       lbz     r7, LPPACA_PMCINUSE(r8)
+       cmpwi   r7, 0                   /* did they ask for PMU stuff to be saved? */
+       bne     21f
+       std     r3, VCPU_MMCR(r9)       /* if not, set saved MMCR0 to FC */
+       b       22f
+21:    mfspr   r5, SPRN_MMCR1
+       mfspr   r6, SPRN_MMCRA
+       std     r4, VCPU_MMCR(r9)
+       std     r5, VCPU_MMCR + 8(r9)
+       std     r6, VCPU_MMCR + 16(r9)
+       mfspr   r3, SPRN_PMC1
+       mfspr   r4, SPRN_PMC2
+       mfspr   r5, SPRN_PMC3
+       mfspr   r6, SPRN_PMC4
+       mfspr   r7, SPRN_PMC5
+       mfspr   r8, SPRN_PMC6
+BEGIN_FTR_SECTION
+       mfspr   r10, SPRN_PMC7
+       mfspr   r11, SPRN_PMC8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+       stw     r3, VCPU_PMC(r9)
+       stw     r4, VCPU_PMC + 4(r9)
+       stw     r5, VCPU_PMC + 8(r9)
+       stw     r6, VCPU_PMC + 12(r9)
+       stw     r7, VCPU_PMC + 16(r9)
+       stw     r8, VCPU_PMC + 20(r9)
+BEGIN_FTR_SECTION
+       stw     r10, VCPU_PMC + 24(r9)
+       stw     r11, VCPU_PMC + 28(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+22:
+       /* save FP state */
+       mr      r3, r9
+       bl      .kvmppc_save_fp
+
+       /* Secondary threads go off to take a nap on POWER7 */
+BEGIN_FTR_SECTION
+       lwz     r0,VCPU_PTID(r3)
+       cmpwi   r0,0
+       bne     secondary_nap
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+       /*
+        * Reload DEC.  HDEC interrupts were disabled when
+        * we reloaded the host's LPCR value.
+        */
+       ld      r3, HSTATE_DECEXP(r13)