KVM: PPC: Emulator: clean up SPR reads and writes
[linux-3.10.git] arch/powerpc/kvm/book3s_hv.c
index dc70e77..db36598 100644
@@ -24,6 +24,7 @@
 #include <linux/preempt.h>
 #include <linux/sched.h>
 #include <linux/delay.h>
+#include <linux/export.h>
 #include <linux/fs.h>
 #include <linux/anon_inodes.h>
 #include <linux/cpumask.h>
 #include <asm/processor.h>
 #include <asm/cputhreads.h>
 #include <asm/page.h>
+#include <asm/hvcall.h>
+#include <asm/switch_to.h>
 #include <linux/gfp.h>
-#include <linux/sched.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
-
-/*
- * For now, limit memory to 64GB and require it to be large pages.
- * This value is chosen because it makes the ram_pginfo array be
- * 64kB in size, which is about as large as we want to be trying
- * to allocate with kmalloc.
- */
-#define MAX_MEM_ORDER          36
-
-#define LARGE_PAGE_ORDER       24      /* 16MB pages */
+#include <linux/hugetlb.h>
 
 /* #define EXIT_DEBUG */
 /* #define EXIT_DEBUG_SIMPLE */
 /* #define EXIT_DEBUG_INT */
 
+static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
+static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu);
+
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
        local_paca->kvm_hstate.kvm_vcpu = vcpu;
-       local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore;
+       local_paca->kvm_hstate.kvm_vcore = vc;
+       if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE)
+               vc->stolen_tb += mftb() - vc->preempt_tb;
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
-}
-
-static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
-static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
-
-void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
-{
-       u64 now;
-       unsigned long dec_nsec;
-
-       now = get_tb();
-       if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
-               kvmppc_core_queue_dec(vcpu);
-       if (vcpu->arch.pending_exceptions)
-               return;
-       if (vcpu->arch.dec_expires != ~(u64)0) {
-               dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
-                       tb_ticks_per_sec;
-               hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
-                             HRTIMER_MODE_REL);
-       }
-
-       kvmppc_vcpu_blocked(vcpu);
-
-       kvm_vcpu_block(vcpu);
-       vcpu->stat.halt_wakeup++;
-
-       if (vcpu->arch.dec_expires != ~(u64)0)
-               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+       struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
-       kvmppc_vcpu_unblocked(vcpu);
+       if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE)
+               vc->preempt_tb = mftb();
 }
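
For illustration: the two hooks above accumulate "stolen" time for the whole virtual core, i.e. timebase ticks that pass while the vcore's runner task is scheduled out; kvmppc_create_dtl_entry() below reports the accumulated value to the guest. A minimal sketch of the bookkeeping, with invented helper names (the real accounting is inline in the load/put hooks and in kvmppc_run_core()):

/* Sketch only; vcore_stolen_begin/end are made-up names. */
static void vcore_stolen_begin(struct kvmppc_vcore *vc)
{
        vc->preempt_tb = mftb();                        /* runner scheduled out */
}

static void vcore_stolen_end(struct kvmppc_vcore *vc)
{
        vc->stolen_tb += mftb() - vc->preempt_tb;       /* runner back on a cpu */
}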
 
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
 {
        vcpu->arch.shregs.msr = msr;
+       kvmppc_end_cede(vcpu);
 }
 
 void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
@@ -168,85 +142,204 @@ static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
        vpa->yield_count = 1;
 }
 
+/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
+struct reg_vpa {
+       u32 dummy;
+       union {
+               u16 hword;
+               u32 word;
+       } length;
+};
+
+static int vpa_is_registered(struct kvmppc_vpa *vpap)
+{
+       if (vpap->update_pending)
+               return vpap->next_gpa != 0;
+       return vpap->pinned_addr != NULL;
+}
+
 static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
                                       unsigned long flags,
                                       unsigned long vcpuid, unsigned long vpa)
 {
        struct kvm *kvm = vcpu->kvm;
-       unsigned long pg_index, ra, len;
-       unsigned long pg_offset;
+       unsigned long len, nb;
        void *va;
        struct kvm_vcpu *tvcpu;
+       int err;
+       int subfunc;
+       struct kvmppc_vpa *vpap;
 
        tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
        if (!tvcpu)
                return H_PARAMETER;
 
-       flags >>= 63 - 18;
-       flags &= 7;
-       if (flags == 0 || flags == 4)
-               return H_PARAMETER;
-       if (flags < 4) {
-               if (vpa & 0x7f)
-                       return H_PARAMETER;
-               /* registering new area; convert logical addr to real */
-               pg_index = vpa >> kvm->arch.ram_porder;
-               pg_offset = vpa & (kvm->arch.ram_psize - 1);
-               if (pg_index >= kvm->arch.ram_npages)
+       subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
+       if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
+           subfunc == H_VPA_REG_SLB) {
+               /* Registering new area - address must be cache-line aligned */
+               if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
                        return H_PARAMETER;
-               if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
+
+               /* convert logical addr to kernel addr and read length */
+               va = kvmppc_pin_guest_page(kvm, vpa, &nb);
+               if (va == NULL)
                        return H_PARAMETER;
-               ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
-               ra |= pg_offset;
-               va = __va(ra);
-               if (flags <= 1)
-                       len = *(unsigned short *)(va + 4);
+               if (subfunc == H_VPA_REG_VPA)
+                       len = ((struct reg_vpa *)va)->length.hword;
                else
-                       len = *(unsigned int *)(va + 4);
-               if (pg_offset + len > kvm->arch.ram_psize)
+                       len = ((struct reg_vpa *)va)->length.word;
+               kvmppc_unpin_guest_page(kvm, va);
+
+               /* Check length */
+               if (len > nb || len < sizeof(struct reg_vpa))
                        return H_PARAMETER;
-               switch (flags) {
-               case 1:         /* register VPA */
-                       if (len < 640)
-                               return H_PARAMETER;
-                       tvcpu->arch.vpa = va;
-                       init_vpa(vcpu, va);
-                       break;
-               case 2:         /* register DTL */
-                       if (len < 48)
-                               return H_PARAMETER;
-                       if (!tvcpu->arch.vpa)
-                               return H_RESOURCE;
-                       len -= len % 48;
-                       tvcpu->arch.dtl = va;
-                       tvcpu->arch.dtl_end = va + len;
+       } else {
+               vpa = 0;
+               len = 0;
+       }
+
+       err = H_PARAMETER;
+       vpap = NULL;
+       spin_lock(&tvcpu->arch.vpa_update_lock);
+
+       switch (subfunc) {
+       case H_VPA_REG_VPA:             /* register VPA */
+               if (len < sizeof(struct lppaca))
                        break;
-               case 3:         /* register SLB shadow buffer */
-                       if (len < 8)
-                               return H_PARAMETER;
-                       if (!tvcpu->arch.vpa)
-                               return H_RESOURCE;
-                       tvcpu->arch.slb_shadow = va;
-                       len = (len - 16) / 16;
-                       tvcpu->arch.slb_shadow = va;
+               vpap = &tvcpu->arch.vpa;
+               err = 0;
+               break;
+
+       case H_VPA_REG_DTL:             /* register DTL */
+               if (len < sizeof(struct dtl_entry))
                        break;
-               }
-       } else {
-               switch (flags) {
-               case 5:         /* unregister VPA */
-                       if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
-                               return H_RESOURCE;
-                       tvcpu->arch.vpa = NULL;
+               len -= len % sizeof(struct dtl_entry);
+
+               /* Check that they have previously registered a VPA */
+               err = H_RESOURCE;
+               if (!vpa_is_registered(&tvcpu->arch.vpa))
                        break;
-               case 6:         /* unregister DTL */
-                       tvcpu->arch.dtl = NULL;
+
+               vpap = &tvcpu->arch.dtl;
+               err = 0;
+               break;
+
+       case H_VPA_REG_SLB:             /* register SLB shadow buffer */
+               /* Check that they have previously registered a VPA */
+               err = H_RESOURCE;
+               if (!vpa_is_registered(&tvcpu->arch.vpa))
                        break;
-               case 7:         /* unregister SLB shadow buffer */
-                       tvcpu->arch.slb_shadow = NULL;
+
+               vpap = &tvcpu->arch.slb_shadow;
+               err = 0;
+               break;
+
+       case H_VPA_DEREG_VPA:           /* deregister VPA */
+               /* Check they don't still have a DTL or SLB buf registered */
+               err = H_RESOURCE;
+               if (vpa_is_registered(&tvcpu->arch.dtl) ||
+                   vpa_is_registered(&tvcpu->arch.slb_shadow))
                        break;
+
+               vpap = &tvcpu->arch.vpa;
+               err = 0;
+               break;
+
+       case H_VPA_DEREG_DTL:           /* deregister DTL */
+               vpap = &tvcpu->arch.dtl;
+               err = 0;
+               break;
+
+       case H_VPA_DEREG_SLB:           /* deregister SLB shadow buffer */
+               vpap = &tvcpu->arch.slb_shadow;
+               err = 0;
+               break;
+       }
+
+       if (vpap) {
+               vpap->next_gpa = vpa;
+               vpap->len = len;
+               vpap->update_pending = 1;
+       }
+
+       spin_unlock(&tvcpu->arch.vpa_update_lock);
+
+       return err;
+}
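
For reference, the guest half of this interface is a single H_REGISTER_VPA hcall whose flags argument carries the subfunction decoded by do_h_register_vpa() above. A hedged sketch of a registration call; the helper and constant names are taken from the pseries guest code and this tree's hvcall.h, not from this patch:

/* Illustrative guest-side call, not part of this patch. */
static long register_vpa_example(int cpu)
{
        unsigned long flags = (unsigned long)H_VPA_REG_VPA << H_VPA_FUNC_SHIFT;

        return plpar_hcall_norets(H_REGISTER_VPA, flags,
                                  get_hard_smp_processor_id(cpu),
                                  __pa(&lppaca_of(cpu)));
}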
+
+static void kvmppc_update_vpa(struct kvm *kvm, struct kvmppc_vpa *vpap)
+{
+       void *va;
+       unsigned long nb;
+
+       vpap->update_pending = 0;
+       va = NULL;
+       if (vpap->next_gpa) {
+               va = kvmppc_pin_guest_page(kvm, vpap->next_gpa, &nb);
+               if (nb < vpap->len) {
+                       /*
+                        * If it's now too short, it must be that userspace
+                        * has changed the mappings underlying guest memory,
+                        * so unregister the region.
+                        */
+                       kvmppc_unpin_guest_page(kvm, va);
+                       va = NULL;
                }
        }
-       return H_SUCCESS;
+       if (vpap->pinned_addr)
+               kvmppc_unpin_guest_page(kvm, vpap->pinned_addr);
+       vpap->pinned_addr = va;
+       if (va)
+               vpap->pinned_end = va + vpap->len;
+}
+
+static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+
+       spin_lock(&vcpu->arch.vpa_update_lock);
+       if (vcpu->arch.vpa.update_pending) {
+               kvmppc_update_vpa(kvm, &vcpu->arch.vpa);
+               init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
+       }
+       if (vcpu->arch.dtl.update_pending) {
+               kvmppc_update_vpa(kvm, &vcpu->arch.dtl);
+               vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
+               vcpu->arch.dtl_index = 0;
+       }
+       if (vcpu->arch.slb_shadow.update_pending)
+               kvmppc_update_vpa(kvm, &vcpu->arch.slb_shadow);
+       spin_unlock(&vcpu->arch.vpa_update_lock);
+}
+
+static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
+                                   struct kvmppc_vcore *vc)
+{
+       struct dtl_entry *dt;
+       struct lppaca *vpa;
+       unsigned long old_stolen;
+
+       dt = vcpu->arch.dtl_ptr;
+       vpa = vcpu->arch.vpa.pinned_addr;
+       old_stolen = vcpu->arch.stolen_logged;
+       vcpu->arch.stolen_logged = vc->stolen_tb;
+       if (!dt || !vpa)
+               return;
+       memset(dt, 0, sizeof(struct dtl_entry));
+       dt->dispatch_reason = 7;
+       dt->processor_id = vc->pcpu + vcpu->arch.ptid;
+       dt->timebase = mftb();
+       dt->enqueue_to_dispatch_time = vc->stolen_tb - old_stolen;
+       dt->srr0 = kvmppc_get_pc(vcpu);
+       dt->srr1 = vcpu->arch.shregs.msr;
+       ++dt;
+       if (dt == vcpu->arch.dtl.pinned_end)
+               dt = vcpu->arch.dtl.pinned_addr;
+       vcpu->arch.dtl_ptr = dt;
+       /* order writing *dt vs. writing vpa->dtl_idx */
+       smp_wmb();
+       vpa->dtl_idx = ++vcpu->arch.dtl_index;
 }
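
kvmppc_create_dtl_entry() treats the registered DTL as a ring: the write pointer wraps at pinned_end, while vpa->dtl_idx counts entries ever written and never wraps. A hypothetical consumer therefore keeps its own running count (names invented for illustration; the real consumer is the guest's dispatch-trace code):

/* Hypothetical reader of the ring filled above. */
static void drain_dtl_example(struct lppaca *vpa, struct dtl_entry *ring,
                              unsigned long n_entries, u64 *read_idx)
{
        while (*read_idx < vpa->dtl_idx) {
                struct dtl_entry *dt = &ring[*read_idx % n_entries];

                /* ... consume *dt (timebase, srr0/srr1, stolen time) ... */
                ++*read_idx;
        }
}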
 
 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
@@ -256,16 +349,13 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
        struct kvm_vcpu *tvcpu;
 
        switch (req) {
+       case H_ENTER:
+               ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                             kvmppc_get_gpr(vcpu, 5),
+                                             kvmppc_get_gpr(vcpu, 6),
+                                             kvmppc_get_gpr(vcpu, 7));
+               break;
        case H_CEDE:
-               vcpu->arch.shregs.msr |= MSR_EE;
-               vcpu->arch.ceded = 1;
-               smp_mb();
-               if (!vcpu->arch.prodded)
-                       kvmppc_vcpu_block(vcpu);
-               else
-                       vcpu->arch.prodded = 0;
-               smp_mb();
-               vcpu->arch.ceded = 0;
                break;
        case H_PROD:
                target = kvmppc_get_gpr(vcpu, 4);
@@ -354,20 +444,19 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                break;
        }
        /*
-        * We get these next two if the guest does a bad real-mode access,
-        * as we have enabled VRMA (virtualized real mode area) mode in the
-        * LPCR.  We just generate an appropriate DSI/ISI to the guest.
+        * We get these next two if the guest accesses a page which it thinks
+        * it has mapped but which is not actually present, either because
+        * it is for an emulated I/O device or because the corresponding
+        * host page has been paged out.  Any other HDSI/HISI interrupts
+        * have been handled already.
         */
        case BOOK3S_INTERRUPT_H_DATA_STORAGE:
-               vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr;
-               vcpu->arch.shregs.dar = vcpu->arch.fault_dar;
-               kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
-               r = RESUME_GUEST;
+               r = kvmppc_book3s_hv_page_fault(run, vcpu,
+                               vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
                break;
        case BOOK3S_INTERRUPT_H_INST_STORAGE:
-               kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
-                                       0x08000000);
-               r = RESUME_GUEST;
+               r = kvmppc_book3s_hv_page_fault(run, vcpu,
+                               kvmppc_get_pc(vcpu), 0);
                break;
        /*
         * This occurs if the guest executes an illegal instruction.
@@ -388,20 +477,6 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                break;
        }
 
-
-       if (!(r & RESUME_HOST)) {
-               /* To avoid clobbering exit_reason, only check for signals if
-                * we aren't already exiting to userspace for some other
-                * reason. */
-               if (signal_pending(tsk)) {
-                       vcpu->stat.signal_exits++;
-                       run->exit_reason = KVM_EXIT_INTR;
-                       r = -EINTR;
-               } else {
-                       kvmppc_core_deliver_interrupts(vcpu);
-               }
-       }
-
        return r;
 }
 
@@ -441,10 +516,45 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+       int r = -EINVAL;
+
+       switch (reg->id) {
+       case KVM_REG_PPC_HIOR:
+               r = put_user(0, (u64 __user *)reg->addr);
+               break;
+       default:
+               break;
+       }
+
+       return r;
+}
+
+int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+       int r = -EINVAL;
+
+       switch (reg->id) {
+       case KVM_REG_PPC_HIOR:
+       {
+               u64 hior;
+               /* Only allow this to be set to zero */
+               r = get_user(hior, (u64 __user *)reg->addr);
+               if (!r && (hior != 0))
+                       r = -EINVAL;
+               break;
+       }
+       default:
+               break;
+       }
+
+       return r;
+}
+
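
These two handlers back the KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls; HV guests always run with HIOR 0, so only that value is accepted. A hypothetical userspace caller, not from this patch:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Setting any non-zero HIOR is rejected with -EINVAL by the handler above. */
static int set_hior_zero(int vcpu_fd)
{
        __u64 hior = 0;
        struct kvm_one_reg reg = {
                .id   = KVM_REG_PPC_HIOR,
                .addr = (__u64)(unsigned long)&hior,
        };

        return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}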
 int kvmppc_core_check_processor_compat(void)
 {
-       if (cpu_has_feature(CPU_FTR_HVMODE) &&
-           cpu_has_feature(CPU_FTR_ARCH_206))
+       if (cpu_has_feature(CPU_FTR_HVMODE))
                return 0;
        return -EIO;
 }
@@ -461,7 +571,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
                goto out;
 
        err = -ENOMEM;
-       vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+       vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
        if (!vcpu)
                goto out;
 
@@ -476,17 +586,14 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
        /* default to host PVR, since we can't spoof it */
        vcpu->arch.pvr = mfspr(SPRN_PVR);
        kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+       spin_lock_init(&vcpu->arch.vpa_update_lock);
 
        kvmppc_mmu_book3s_hv_init(vcpu);
 
        /*
-        * Some vcpus may start out in stopped state.  If we initialize
-        * them to busy-in-host state they will stop other vcpus in the
-        * vcore from running.  Instead we initialize them to blocked
-        * state, effectively considering them to be stopped until we
-        * see the first run ioctl for them.
+        * We consider the vcpu stopped until we see the first run ioctl for it.
         */
-       vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+       vcpu->arch.state = KVMPPC_VCPU_STOPPED;
 
        init_waitqueue_head(&vcpu->arch.cpu_run);
 
@@ -497,6 +604,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
                if (vcore) {
                        INIT_LIST_HEAD(&vcore->runnable_threads);
                        spin_lock_init(&vcore->lock);
+                       init_waitqueue_head(&vcore->wq);
+                       vcore->preempt_tb = mftb();
                }
                kvm->arch.vcores[core] = vcore;
        }
@@ -507,48 +616,60 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 
        spin_lock(&vcore->lock);
        ++vcore->num_threads;
-       ++vcore->n_blocked;
        spin_unlock(&vcore->lock);
        vcpu->arch.vcore = vcore;
+       vcpu->arch.stolen_logged = vcore->stolen_tb;
+
+       vcpu->arch.cpu_type = KVM_CPU_3S_64;
+       kvmppc_sanity_check(vcpu);
 
        return vcpu;
 
 free_vcpu:
-       kfree(vcpu);
+       kmem_cache_free(kvm_vcpu_cache, vcpu);
 out:
        return ERR_PTR(err);
 }
 
 void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
+       spin_lock(&vcpu->arch.vpa_update_lock);
+       if (vcpu->arch.dtl.pinned_addr)
+               kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl.pinned_addr);
+       if (vcpu->arch.slb_shadow.pinned_addr)
+               kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow.pinned_addr);
+       if (vcpu->arch.vpa.pinned_addr)
+               kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa.pinned_addr);
+       spin_unlock(&vcpu->arch.vpa_update_lock);
        kvm_vcpu_uninit(vcpu);
-       kfree(vcpu);
+       kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
-static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu)
+static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
 {
-       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       unsigned long dec_nsec, now;
 
-       spin_lock(&vc->lock);
-       vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
-       ++vc->n_blocked;
-       if (vc->n_runnable > 0 &&
-           vc->n_runnable + vc->n_blocked == vc->num_threads) {
-               vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
-                                       arch.run_list);
-               wake_up(&vcpu->arch.cpu_run);
+       now = get_tb();
+       if (now > vcpu->arch.dec_expires) {
+               /* decrementer has already gone negative */
+               kvmppc_core_queue_dec(vcpu);
+               kvmppc_core_prepare_to_enter(vcpu);
+               return;
        }
-       spin_unlock(&vc->lock);
+       dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
+                  / tb_ticks_per_sec;
+       hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+                     HRTIMER_MODE_REL);
+       vcpu->arch.timer_running = 1;
 }
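
A worked example of the scaling above, which converts the remaining timebase ticks into nanoseconds for the hrtimer (illustrative numbers only):

/*
 * dec_nsec = (dec_expires - now) * NSEC_PER_SEC / tb_ticks_per_sec
 *          = 256000000 * 1000000000 / 512000000
 *          = 500000000 ns       (half a second of decrementer left)
 */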
 
-static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu)
+static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
 {
-       struct kvmppc_vcore *vc = vcpu->arch.vcore;
-
-       spin_lock(&vc->lock);
-       vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
-       --vc->n_blocked;
-       spin_unlock(&vc->lock);
+       vcpu->arch.ceded = 0;
+       if (vcpu->arch.timer_running) {
+               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+               vcpu->arch.timer_running = 0;
+       }
 }
 
 extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
@@ -563,6 +684,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
                return;
        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
        --vc->n_runnable;
+       ++vc->n_busy;
        /* decrement the physical thread id of each following vcpu */
        v = vcpu;
        list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
@@ -570,22 +692,65 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
        list_del(&vcpu->arch.run_list);
 }
 
+static int kvmppc_grab_hwthread(int cpu)
+{
+       struct paca_struct *tpaca;
+       long timeout = 1000;
+
+       tpaca = &paca[cpu];
+
+       /* Ensure the thread won't go into the kernel if it wakes */
+       tpaca->kvm_hstate.hwthread_req = 1;
+
+       /*
+        * If the thread is already executing in the kernel (e.g. handling
+        * a stray interrupt), wait for it to get back to nap mode.
+        * The smp_mb() is to ensure that our setting of hwthread_req
+        * is visible before we look at hwthread_state, so if this
+        * races with the code at system_reset_pSeries and the thread
+        * misses our setting of hwthread_req, we are sure to see its
+        * setting of hwthread_state, and vice versa.
+        */
+       smp_mb();
+       while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
+               if (--timeout <= 0) {
+                       pr_err("KVM: couldn't grab cpu %d\n", cpu);
+                       return -EBUSY;
+               }
+               udelay(1);
+       }
+       return 0;
+}
+
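
The barrier comment above describes a store-then-load handshake performed by both sides. A rough C rendering of the offline thread's half (the real code is assembly reached from system_reset, so this is a sketch only):

/* Sketch: mirrors the ordering argument in kvmppc_grab_hwthread(). */
static int hwthread_may_enter_kernel(struct paca_struct *p)
{
        p->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
        smp_mb();       /* pairs with the smp_mb() in kvmppc_grab_hwthread() */
        return p->kvm_hstate.hwthread_req == 0;
}

At least one side observes the other's store, so either the grabber waits for the thread to return to nap or the thread sees the request and stays out of the kernel.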
+static void kvmppc_release_hwthread(int cpu)
+{
+       struct paca_struct *tpaca;
+
+       tpaca = &paca[cpu];
+       tpaca->kvm_hstate.hwthread_req = 0;
+       tpaca->kvm_hstate.kvm_vcpu = NULL;
+}
+
 static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
 {
        int cpu;
        struct paca_struct *tpaca;
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
+       if (vcpu->arch.timer_running) {
+               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+               vcpu->arch.timer_running = 0;
+       }
        cpu = vc->pcpu + vcpu->arch.ptid;
        tpaca = &paca[cpu];
        tpaca->kvm_hstate.kvm_vcpu = vcpu;
        tpaca->kvm_hstate.kvm_vcore = vc;
+       tpaca->kvm_hstate.napping = 0;
+       vcpu->cpu = vc->pcpu;
        smp_wmb();
-#ifdef CONFIG_PPC_ICP_NATIVE
+#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
        if (vcpu->arch.ptid) {
-               tpaca->cpu_start = 0x80;
-               tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST;
-               wmb();
+               kvmppc_grab_hwthread(cpu);
                xics_wake_cpu(cpu);
                ++vc->n_woken;
        }
@@ -632,9 +797,10 @@ static int on_primary_thread(void)
  */
 static int kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-       struct kvm_vcpu *vcpu, *vnext;
+       struct kvm_vcpu *vcpu, *vcpu0, *vnext;
        long ret;
        u64 now;
+       int ptid, i;
 
        /* don't start if any threads have a signal pending */
        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
@@ -653,29 +819,62 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
                goto out;
        }
 
+       /*
+        * Assign physical thread IDs, first to non-ceded vcpus
+        * and then to ceded ones.
+        */
+       ptid = 0;
+       vcpu0 = NULL;
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+               if (!vcpu->arch.ceded) {
+                       if (!ptid)
+                               vcpu0 = vcpu;
+                       vcpu->arch.ptid = ptid++;
+               }
+       }
+       if (!vcpu0)
+               return 0;               /* nothing to run */
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+               if (vcpu->arch.ceded)
+                       vcpu->arch.ptid = ptid++;
+
        vc->n_woken = 0;
        vc->nap_count = 0;
        vc->entry_exit_count = 0;
-       vc->vcore_running = 1;
+       vc->vcore_state = VCORE_RUNNING;
+       vc->stolen_tb += mftb() - vc->preempt_tb;
        vc->in_guest = 0;
        vc->pcpu = smp_processor_id();
-       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+       vc->napping_threads = 0;
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
                kvmppc_start_thread(vcpu);
-       vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu,
-                               arch.run_list);
+               if (vcpu->arch.vpa.update_pending ||
+                   vcpu->arch.slb_shadow.update_pending ||
+                   vcpu->arch.dtl.update_pending)
+                       kvmppc_update_vpas(vcpu);
+               kvmppc_create_dtl_entry(vcpu, vc);
+       }
+       /* Grab any remaining hw threads so they can't go into the kernel */
+       for (i = ptid; i < threads_per_core; ++i)
+               kvmppc_grab_hwthread(vc->pcpu + i);
 
+       preempt_disable();
        spin_unlock(&vc->lock);
 
-       preempt_disable();
        kvm_guest_enter();
-       __kvmppc_vcore_entry(NULL, vcpu);
+       __kvmppc_vcore_entry(NULL, vcpu0);
+       for (i = 0; i < threads_per_core; ++i)
+               kvmppc_release_hwthread(vc->pcpu + i);
 
-       /* wait for secondary threads to finish writing their state to memory */
        spin_lock(&vc->lock);
+       /* disable sending of IPIs on virtual external irqs */
+       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+               vcpu->cpu = -1;
+       /* wait for secondary threads to finish writing their state to memory */
        if (vc->nap_count < vc->n_woken)
                kvmppc_wait_for_nap(vc);
        /* prevent other vcpu threads from doing kvmppc_start_thread() now */
-       vc->vcore_running = 2;
+       vc->vcore_state = VCORE_EXITING;
        spin_unlock(&vc->lock);
 
        /* make sure updates to secondary vcpu structs are visible now */
@@ -691,22 +890,27 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
                if (now < vcpu->arch.dec_expires &&
                    kvmppc_core_pending_dec(vcpu))
                        kvmppc_core_dequeue_dec(vcpu);
-               if (!vcpu->arch.trap) {
-                       if (signal_pending(vcpu->arch.run_task)) {
-                               vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
-                               vcpu->arch.ret = -EINTR;
-                       }
-                       continue;               /* didn't get to run */
-               }
-               ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
-                                        vcpu->arch.run_task);
+
+               ret = RESUME_GUEST;
+               if (vcpu->arch.trap)
+                       ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu,
+                                                vcpu->arch.run_task);
+
                vcpu->arch.ret = ret;
                vcpu->arch.trap = 0;
+
+               if (vcpu->arch.ceded) {
+                       if (ret != RESUME_GUEST)
+                               kvmppc_end_cede(vcpu);
+                       else
+                               kvmppc_set_timer(vcpu);
+               }
        }
 
        spin_lock(&vc->lock);
  out:
-       vc->vcore_running = 0;
+       vc->vcore_state = VCORE_INACTIVE;
+       vc->preempt_tb = mftb();
        list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
                                 arch.run_list) {
                if (vcpu->arch.ret != RESUME_GUEST) {
@@ -718,78 +922,132 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
        return 1;
 }
 
-static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+/*
+ * Wait for some other vcpu thread to execute us, and
+ * wake us up when we need to handle something in the host.
+ */
+static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
 {
-       int ptid;
-       int wait_state;
-       struct kvmppc_vcore *vc;
        DEFINE_WAIT(wait);
 
-       /* No need to go into the guest when all we do is going out */
-       if (signal_pending(current)) {
-               kvm_run->exit_reason = KVM_EXIT_INTR;
-               return -EINTR;
+       prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
+       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+               schedule();
+       finish_wait(&vcpu->arch.cpu_run, &wait);
+}
+
+/*
+ * All the vcpus in this vcore are idle, so wait for a decrementer
+ * or external interrupt to one of the vcpus.  vc->lock is held.
+ */
+static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
+{
+       DEFINE_WAIT(wait);
+       struct kvm_vcpu *v;
+       int all_idle = 1;
+
+       prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+       vc->vcore_state = VCORE_SLEEPING;
+       spin_unlock(&vc->lock);
+       list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
+               if (!v->arch.ceded || v->arch.pending_exceptions) {
+                       all_idle = 0;
+                       break;
+               }
        }
+       if (all_idle)
+               schedule();
+       finish_wait(&vc->wq, &wait);
+       spin_lock(&vc->lock);
+       vc->vcore_state = VCORE_INACTIVE;
+}
+
+static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+       int n_ceded;
+       int prev_state;
+       struct kvmppc_vcore *vc;
+       struct kvm_vcpu *v, *vn;
 
        kvm_run->exit_reason = 0;
        vcpu->arch.ret = RESUME_GUEST;
        vcpu->arch.trap = 0;
 
-       flush_fp_to_thread(current);
-       flush_altivec_to_thread(current);
-       flush_vsx_to_thread(current);
-
        /*
         * Synchronize with other threads in this virtual core
         */
        vc = vcpu->arch.vcore;
        spin_lock(&vc->lock);
-       /* This happens the first time this is called for a vcpu */
-       if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
-               --vc->n_blocked;
-       vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
-       ptid = vc->n_runnable;
+       vcpu->arch.ceded = 0;
        vcpu->arch.run_task = current;
        vcpu->arch.kvm_run = kvm_run;
-       vcpu->arch.ptid = ptid;
+       prev_state = vcpu->arch.state;
+       vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
        list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
        ++vc->n_runnable;
 
-       wait_state = TASK_INTERRUPTIBLE;
-       while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
-               if (signal_pending(current)) {
-                       if (!vc->vcore_running) {
-                               kvm_run->exit_reason = KVM_EXIT_INTR;
-                               vcpu->arch.ret = -EINTR;
-                               break;
-                       }
-                       /* have to wait for vcore to stop executing guest */
-                       wait_state = TASK_UNINTERRUPTIBLE;
-                       smp_send_reschedule(vc->pcpu);
+       /*
+        * This happens the first time this is called for a vcpu.
+        * If the vcore is already running, we may be able to start
+        * this thread straight away and have it join in.
+        */
+       if (prev_state == KVMPPC_VCPU_STOPPED) {
+               if (vc->vcore_state == VCORE_RUNNING &&
+                   VCORE_EXIT_COUNT(vc) == 0) {
+                       vcpu->arch.ptid = vc->n_runnable - 1;
+                       kvmppc_start_thread(vcpu);
                }
 
-               if (!vc->vcore_running &&
-                   vc->n_runnable + vc->n_blocked == vc->num_threads) {
-                       /* we can run now */
-                       if (kvmppc_run_core(vc))
-                               continue;
-               }
+       } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST)
+               --vc->n_busy;
 
-               if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0)
-                       kvmppc_start_thread(vcpu);
+       while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
+              !signal_pending(current)) {
+               if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) {
+                       spin_unlock(&vc->lock);
+                       kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
+                       spin_lock(&vc->lock);
+                       continue;
+               }
+               vc->runner = vcpu;
+               n_ceded = 0;
+               list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
+                       n_ceded += v->arch.ceded;
+               if (n_ceded == vc->n_runnable)
+                       kvmppc_vcore_blocked(vc);
+               else
+                       kvmppc_run_core(vc);
+
+               list_for_each_entry_safe(v, vn, &vc->runnable_threads,
+                                        arch.run_list) {
+                       kvmppc_core_prepare_to_enter(v);
+                       if (signal_pending(v->arch.run_task)) {
+                               kvmppc_remove_runnable(vc, v);
+                               v->stat.signal_exits++;
+                               v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
+                               v->arch.ret = -EINTR;
+                               wake_up(&v->arch.cpu_run);
+                       }
+               }
+               vc->runner = NULL;
+       }
 
-               /* wait for other threads to come in, or wait for vcore */
-               prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
-               spin_unlock(&vc->lock);
-               schedule();
-               finish_wait(&vcpu->arch.cpu_run, &wait);
-               spin_lock(&vc->lock);
+       if (signal_pending(current)) {
+               if (vc->vcore_state == VCORE_RUNNING ||
+                   vc->vcore_state == VCORE_EXITING) {
+                       spin_unlock(&vc->lock);
+                       kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
+                       spin_lock(&vc->lock);
+               }
+               if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+                       kvmppc_remove_runnable(vc, vcpu);
+                       vcpu->stat.signal_exits++;
+                       kvm_run->exit_reason = KVM_EXIT_INTR;
+                       vcpu->arch.ret = -EINTR;
+               }
        }
 
-       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
-               kvmppc_remove_runnable(vc, vcpu);
        spin_unlock(&vc->lock);
-
        return vcpu->arch.ret;
 }
 
@@ -797,135 +1055,54 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
        int r;
 
+       if (!vcpu->arch.sane) {
+               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               return -EINVAL;
+       }
+
+       kvmppc_core_prepare_to_enter(vcpu);
+
+       /* No need to go into the guest when all we'll do is come back out */
+       if (signal_pending(current)) {
+               run->exit_reason = KVM_EXIT_INTR;
+               return -EINTR;
+       }
+
+       /* On the first time here, set up VRMA or RMA */
+       if (!vcpu->kvm->arch.rma_setup_done) {
+               r = kvmppc_hv_setup_rma(vcpu);
+               if (r)
+                       return r;
+       }
+
+       flush_fp_to_thread(current);
+       flush_altivec_to_thread(current);
+       flush_vsx_to_thread(current);
+       vcpu->arch.wqp = &vcpu->arch.vcore->wq;
+       vcpu->arch.pgdir = current->mm->pgd;
+
        do {
                r = kvmppc_run_vcpu(run, vcpu);
 
                if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
                    !(vcpu->arch.shregs.msr & MSR_PR)) {
                        r = kvmppc_pseries_do_hcall(vcpu);
-                       kvmppc_core_deliver_interrupts(vcpu);
+                       kvmppc_core_prepare_to_enter(vcpu);
                }
        } while (r == RESUME_GUEST);
        return r;
 }
 
-static long kvmppc_stt_npages(unsigned long window_size)
-{
-       return ALIGN((window_size >> SPAPR_TCE_SHIFT)
-                    * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
-}
-
-static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
-{
-       struct kvm *kvm = stt->kvm;
-       int i;
-
-       mutex_lock(&kvm->lock);
-       list_del(&stt->list);
-       for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
-               __free_page(stt->pages[i]);
-       kfree(stt);
-       mutex_unlock(&kvm->lock);
-
-       kvm_put_kvm(kvm);
-}
-
-static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-       struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
-       struct page *page;
-
-       if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
-               return VM_FAULT_SIGBUS;
-
-       page = stt->pages[vmf->pgoff];
-       get_page(page);
-       vmf->page = page;
-       return 0;
-}
-
-static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
-       .fault = kvm_spapr_tce_fault,
-};
-
-static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
-{
-       vma->vm_ops = &kvm_spapr_tce_vm_ops;
-       return 0;
-}
-
-static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
-{
-       struct kvmppc_spapr_tce_table *stt = filp->private_data;
-
-       release_spapr_tce_table(stt);
-       return 0;
-}
-
-static struct file_operations kvm_spapr_tce_fops = {
-       .mmap           = kvm_spapr_tce_mmap,
-       .release        = kvm_spapr_tce_release,
-};
-
-long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-                                  struct kvm_create_spapr_tce *args)
-{
-       struct kvmppc_spapr_tce_table *stt = NULL;
-       long npages;
-       int ret = -ENOMEM;
-       int i;
-
-       /* Check this LIOBN hasn't been previously allocated */
-       list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-               if (stt->liobn == args->liobn)
-                       return -EBUSY;
-       }
-
-       npages = kvmppc_stt_npages(args->window_size);
-
-       stt = kzalloc(sizeof(*stt) + npages* sizeof(struct page *),
-                     GFP_KERNEL);
-       if (!stt)
-               goto fail;
-
-       stt->liobn = args->liobn;
-       stt->window_size = args->window_size;
-       stt->kvm = kvm;
-
-       for (i = 0; i < npages; i++) {
-               stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
-               if (!stt->pages[i])
-                       goto fail;
-       }
-
-       kvm_get_kvm(kvm);
-
-       mutex_lock(&kvm->lock);
-       list_add(&stt->list, &kvm->arch.spapr_tce_tables);
-
-       mutex_unlock(&kvm->lock);
-
-       return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
-                               stt, O_RDWR);
-
-fail:
-       if (stt) {
-               for (i = 0; i < npages; i++)
-                       if (stt->pages[i])
-                               __free_page(stt->pages[i]);
-
-               kfree(stt);
-       }
-       return ret;
-}
 
 /* Work out RMLS (real mode limit selector) field value for a given RMA size.
-   Assumes POWER7. */
+   Assumes POWER7 or PPC970. */
 static inline int lpcr_rmls(unsigned long rma_size)
 {
        switch (rma_size) {
        case 32ul << 20:        /* 32 MB */
-               return 8;
+               if (cpu_has_feature(CPU_FTR_ARCH_206))
+                       return 8;       /* only supported on POWER7 */
+               return -1;
        case 64ul << 20:        /* 64 MB */
                return 3;
        case 128ul << 20:       /* 128 MB */
@@ -945,7 +1122,7 @@ static inline int lpcr_rmls(unsigned long rma_size)
 
 static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       struct kvmppc_rma_info *ri = vma->vm_file->private_data;
+       struct kvmppc_linear_info *ri = vma->vm_file->private_data;
        struct page *page;
 
        if (vmf->pgoff >= ri->npages)
@@ -970,7 +1147,7 @@ static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
 
 static int kvm_rma_release(struct inode *inode, struct file *filp)
 {
-       struct kvmppc_rma_info *ri = filp->private_data;
+       struct kvmppc_linear_info *ri = filp->private_data;
 
        kvm_release_rma(ri);
        return 0;
@@ -983,7 +1160,7 @@ static struct file_operations kvm_rma_fops = {
 
 long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
 {
-       struct kvmppc_rma_info *ri;
+       struct kvmppc_linear_info *ri;
        long fd;
 
        ri = kvm_alloc_rma();
@@ -998,136 +1175,269 @@ long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret)
        return fd;
 }
 
-static struct page *hva_to_page(unsigned long addr)
+static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
+                                    int linux_psize)
+{
+       struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
+
+       if (!def->shift)
+               return;
+       (*sps)->page_shift = def->shift;
+       (*sps)->slb_enc = def->sllp;
+       (*sps)->enc[0].page_shift = def->shift;
+       (*sps)->enc[0].pte_enc = def->penc;
+       (*sps)++;
+}
+
+int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
 {
-       struct page *page[1];
-       int npages;
+       struct kvm_ppc_one_seg_page_size *sps;
 
-       might_sleep();
+       info->flags = KVM_PPC_PAGE_SIZES_REAL;
+       if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
+               info->flags |= KVM_PPC_1T_SEGMENTS;
+       info->slb_size = mmu_slb_size;
 
-       npages = get_user_pages_fast(addr, 1, 1, page);
+       /* We only support these sizes for now, and no multi-size segments */
+       sps = &info->sps[0];
+       kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
+       kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
+       kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
 
-       if (unlikely(npages != 1))
-               return 0;
+       return 0;
+}
+
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+{
+       struct kvm_memory_slot *memslot;
+       int r;
+       unsigned long n;
+
+       mutex_lock(&kvm->slots_lock);
+
+       r = -EINVAL;
+       if (log->slot >= KVM_MEMORY_SLOTS)
+               goto out;
+
+       memslot = id_to_memslot(kvm->memslots, log->slot);
+       r = -ENOENT;
+       if (!memslot->dirty_bitmap)
+               goto out;
+
+       n = kvm_dirty_bitmap_bytes(memslot);
+       memset(memslot->dirty_bitmap, 0, n);
+
+       r = kvmppc_hv_get_dirty_log(kvm, memslot);
+       if (r)
+               goto out;
+
+       r = -EFAULT;
+       if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+               goto out;
+
+       r = 0;
+out:
+       mutex_unlock(&kvm->slots_lock);
+       return r;
+}
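
This is the standard KVM_GET_DIRTY_LOG ioctl, with the HV-specific harvesting of dirty bits done by kvmppc_hv_get_dirty_log(). A hypothetical userspace caller, not from this patch:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* bitmap must be large enough for the slot: one bit per page, rounded up. */
static int fetch_dirty_log(int vm_fd, __u32 slot, void *bitmap)
{
        struct kvm_dirty_log log = {
                .slot = slot,
                .dirty_bitmap = bitmap,
        };

        return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}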
+
+static unsigned long slb_pgsize_encoding(unsigned long psize)
+{
+       unsigned long senc = 0;
 
-       return page[0];
+       if (psize > 0x1000) {
+               senc = SLB_VSID_L;
+               if (psize == 0x10000)
+                       senc |= SLB_VSID_LP_01;
+       }
+       return senc;
 }
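
Spelled out, slb_pgsize_encoding() above yields exactly three encodings, matching the page sizes the VRMA setup below accepts:

/*
 * slb_pgsize_encoding(0x1000)    == 0                              (4K)
 * slb_pgsize_encoding(0x10000)   == SLB_VSID_L | SLB_VSID_LP_01    (64K)
 * slb_pgsize_encoding(0x1000000) == SLB_VSID_L                     (16M)
 */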
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
                                struct kvm_userspace_memory_region *mem)
 {
-       unsigned long psize, porder;
-       unsigned long i, npages, totalpages;
-       unsigned long pg_ix;
-       struct kvmppc_pginfo *pginfo;
-       unsigned long hva;
-       struct kvmppc_rma_info *ri = NULL;
+       unsigned long npages;
+       unsigned long *phys;
+
+       /* Allocate a slot_phys array */
+       phys = kvm->arch.slot_phys[mem->slot];
+       if (!kvm->arch.using_mmu_notifiers && !phys) {
+               npages = mem->memory_size >> PAGE_SHIFT;
+               phys = vzalloc(npages * sizeof(unsigned long));
+               if (!phys)
+                       return -ENOMEM;
+               kvm->arch.slot_phys[mem->slot] = phys;
+               kvm->arch.slot_npages[mem->slot] = npages;
+       }
+
+       return 0;
+}
+
+static void unpin_slot(struct kvm *kvm, int slot_id)
+{
+       unsigned long *physp;
+       unsigned long j, npages, pfn;
        struct page *page;
 
-       /* For now, only allow 16MB pages */
-       porder = LARGE_PAGE_ORDER;
-       psize = 1ul << porder;
-       if ((mem->memory_size & (psize - 1)) ||
-           (mem->guest_phys_addr & (psize - 1))) {
-               pr_err("bad memory_size=%llx @ %llx\n",
-                      mem->memory_size, mem->guest_phys_addr);
-               return -EINVAL;
+       physp = kvm->arch.slot_phys[slot_id];
+       npages = kvm->arch.slot_npages[slot_id];
+       if (physp) {
+               spin_lock(&kvm->arch.slot_phys_lock);
+               for (j = 0; j < npages; j++) {
+                       if (!(physp[j] & KVMPPC_GOT_PAGE))
+                               continue;
+                       pfn = physp[j] >> PAGE_SHIFT;
+                       page = pfn_to_page(pfn);
+                       if (PageHuge(page))
+                               page = compound_head(page);
+                       SetPageDirty(page);
+                       put_page(page);
+               }
+               kvm->arch.slot_phys[slot_id] = NULL;
+               spin_unlock(&kvm->arch.slot_phys_lock);
+               vfree(physp);
        }
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem)
+{
+}
+
+static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu)
+{
+       int err = 0;
+       struct kvm *kvm = vcpu->kvm;
+       struct kvmppc_linear_info *ri = NULL;
+       unsigned long hva;
+       struct kvm_memory_slot *memslot;
+       struct vm_area_struct *vma;
+       unsigned long lpcr, senc;
+       unsigned long psize, porder;
+       unsigned long rma_size;
+       unsigned long rmls;
+       unsigned long *physp;
+       unsigned long i, npages;
 
-       npages = mem->memory_size >> porder;
-       totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder;
+       mutex_lock(&kvm->lock);
+       if (kvm->arch.rma_setup_done)
+               goto out;       /* another vcpu beat us to it */
 
-       /* More memory than we have space to track? */
-       if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER)))
-               return -EINVAL;
+       /* Look up the memslot for guest physical address 0 */
+       memslot = gfn_to_memslot(kvm, 0);
 
-       /* Do we already have an RMA registered? */
-       if (mem->guest_phys_addr == 0 && kvm->arch.rma)
-               return -EINVAL;
+       /* We must have some memory at 0 by now */
+       err = -EINVAL;
+       if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+               goto out;
+
+       /* Look up the VMA for the start of this memory slot */
+       hva = memslot->userspace_addr;
+       down_read(&current->mm->mmap_sem);
+       vma = find_vma(current->mm, hva);
+       if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO))
+               goto up_out;
 
-       if (totalpages > kvm->arch.ram_npages)
-               kvm->arch.ram_npages = totalpages;
+       psize = vma_kernel_pagesize(vma);
+       porder = __ilog2(psize);
 
        /* Is this one of our preallocated RMAs? */
-       if (mem->guest_phys_addr == 0) {
-               struct vm_area_struct *vma;
-
-               down_read(&current->mm->mmap_sem);
-               vma = find_vma(current->mm, mem->userspace_addr);
-               if (vma && vma->vm_file &&
-                   vma->vm_file->f_op == &kvm_rma_fops &&
-                   mem->userspace_addr == vma->vm_start)
-                       ri = vma->vm_file->private_data;
-               up_read(&current->mm->mmap_sem);
-       }
+       if (vma->vm_file && vma->vm_file->f_op == &kvm_rma_fops &&
+           hva == vma->vm_start)
+               ri = vma->vm_file->private_data;
+
+       up_read(&current->mm->mmap_sem);
+
+       if (!ri) {
+               /* On POWER7, use VRMA; on PPC970, give up */
+               err = -EPERM;
+               if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+                       pr_err("KVM: CPU requires an RMO\n");
+                       goto out;
+               }
+
+               /* We can handle 4k, 64k or 16M pages in the VRMA */
+               err = -EINVAL;
+               if (!(psize == 0x1000 || psize == 0x10000 ||
+                     psize == 0x1000000))
+                       goto out;
+
+               /* Update VRMASD field in the LPCR */
+               senc = slb_pgsize_encoding(psize);
+               kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
+                       (VRMA_VSID << SLB_VSID_SHIFT_1T);
+               lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
+               lpcr |= senc << (LPCR_VRMASD_SH - 4);
+               kvm->arch.lpcr = lpcr;
 
-       if (ri) {
-               unsigned long rma_size;
-               unsigned long lpcr;
-               long rmls;
+               /* Create HPTEs in the hash page table for the VRMA */
+               kvmppc_map_vrma(vcpu, memslot, porder);
 
-               rma_size = ri->npages << PAGE_SHIFT;
-               if (rma_size > mem->memory_size)
-                       rma_size = mem->memory_size;
+       } else {
+               /* Set up to use an RMO region */
+               rma_size = ri->npages;
+               if (rma_size > memslot->npages)
+                       rma_size = memslot->npages;
+               rma_size <<= PAGE_SHIFT;
                rmls = lpcr_rmls(rma_size);
+               err = -EINVAL;
                if (rmls < 0) {
-                       pr_err("Can't use RMA of 0x%lx bytes\n", rma_size);
-                       return -EINVAL;
+                       pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size);
+                       goto out;
                }
                atomic_inc(&ri->use_count);
                kvm->arch.rma = ri;
-               kvm->arch.n_rma_pages = rma_size >> porder;
-               lpcr = kvm->arch.lpcr & ~(LPCR_VPM0 | LPCR_VRMA_L);
-               lpcr |= rmls << LPCR_RMLS_SH;
+
+               /* Update LPCR and RMOR */
+               lpcr = kvm->arch.lpcr;
+               if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+                       /* PPC970; insert RMLS value (split field) in HID4 */
+                       lpcr &= ~((1ul << HID4_RMLS0_SH) |
+                                 (3ul << HID4_RMLS2_SH));
+                       lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) |
+                               ((rmls & 3) << HID4_RMLS2_SH);
+                       /* RMOR is also in HID4 */
+                       lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
+                               << HID4_RMOR_SH;
+               } else {
+                       /* POWER7 */
+                       lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L);
+                       lpcr |= rmls << LPCR_RMLS_SH;
+                       kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
+               }
                kvm->arch.lpcr = lpcr;
-               kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT;
-               pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n",
+               pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n",
                        ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
-       }
 
-       pg_ix = mem->guest_phys_addr >> porder;
-       pginfo = kvm->arch.ram_pginfo + pg_ix;
-       for (i = 0; i < npages; ++i, ++pg_ix) {
-               if (ri && pg_ix < kvm->arch.n_rma_pages) {
-                       pginfo[i].pfn = ri->base_pfn +
-                               (pg_ix << (porder - PAGE_SHIFT));
-                       continue;
-               }
-               hva = mem->userspace_addr + (i << porder);
-               page = hva_to_page(hva);
-               if (!page) {
-                       pr_err("oops, no pfn for hva %lx\n", hva);
-                       goto err;
-               }
-               /* Check it's a 16MB page */
-               if (!PageHead(page) ||
-                   compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) {
-                       pr_err("page at %lx isn't 16MB (o=%d)\n",
-                              hva, compound_order(page));
-                       goto err;
-               }
-               pginfo[i].pfn = page_to_pfn(page);
+               /* Initialize phys addrs of pages in RMO */
+               npages = ri->npages;
+               porder = __ilog2(npages);
+               physp = kvm->arch.slot_phys[memslot->id];
+               spin_lock(&kvm->arch.slot_phys_lock);
+               for (i = 0; i < npages; ++i)
+                       physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + porder;
+               spin_unlock(&kvm->arch.slot_phys_lock);
        }
 
-       return 0;
-
- err:
-       return -EINVAL;
-}
+       /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
+       smp_wmb();
+       kvm->arch.rma_setup_done = 1;
+       err = 0;
+ out:
+       mutex_unlock(&kvm->lock);
+       return err;
 
-void kvmppc_core_commit_memory_region(struct kvm *kvm,
-                               struct kvm_userspace_memory_region *mem)
-{
-       if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
-           !kvm->arch.rma)
-               kvmppc_map_vrma(kvm, mem);
+ up_out:
+       up_read(&current->mm->mmap_sem);
+       goto out;
 }
 
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
        long r;
-       unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER);
-       long err = -ENOMEM;
        unsigned long lpcr;
 
        /* Allocate hashed page table */
@@ -1137,46 +1447,43 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 
        INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
 
-       kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo),
-                                      GFP_KERNEL);
-       if (!kvm->arch.ram_pginfo) {
-               pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n",
-                      npages * sizeof(struct kvmppc_pginfo));
-               goto out_free;
-       }
-
-       kvm->arch.ram_npages = 0;
-       kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER;
-       kvm->arch.ram_porder = LARGE_PAGE_ORDER;
        kvm->arch.rma = NULL;
-       kvm->arch.n_rma_pages = 0;
 
-       lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
-       lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
-               LPCR_VPM0 | LPCR_VRMA_L;
-       kvm->arch.lpcr = lpcr;
+       kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 
+       if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+               /* PPC970; HID4 is effectively the LPCR */
+               unsigned long lpid = kvm->arch.lpid;
+               kvm->arch.host_lpid = 0;
+               kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
+               lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
+               lpcr |= ((lpid >> 4) << HID4_LPID1_SH) |
+                       ((lpid & 0xf) << HID4_LPID5_SH);
+       } else {
+               /* POWER7; init LPCR for virtual RMA mode */
+               kvm->arch.host_lpid = mfspr(SPRN_LPID);
+               kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
+               lpcr &= LPCR_PECE | LPCR_LPES;
+               lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
+                       LPCR_VPM0 | LPCR_VPM1;
+               kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
+                       (VRMA_VSID << SLB_VSID_SHIFT_1T);
+       }
+       kvm->arch.lpcr = lpcr;
 
+       kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206);
+       spin_lock_init(&kvm->arch.slot_phys_lock);
        return 0;
-
- out_free:
-       kvmppc_free_hpt(kvm);
-       return err;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
-       struct kvmppc_pginfo *pginfo;
        unsigned long i;
 
-       if (kvm->arch.ram_pginfo) {
-               pginfo = kvm->arch.ram_pginfo;
-               kvm->arch.ram_pginfo = NULL;
-               for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i)
-                       if (pginfo[i].pfn)
-                               put_page(pfn_to_page(pginfo[i].pfn));
-               kfree(pginfo);
-       }
+       if (!kvm->arch.using_mmu_notifiers)
+               for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
+                       unpin_slot(kvm, i);
+
        if (kvm->arch.rma) {
                kvm_release_rma(kvm->arch.rma);
                kvm->arch.rma = NULL;
@@ -1198,12 +1505,12 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
        return EMULATE_FAIL;
 }
 
-int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 {
        return EMULATE_FAIL;
 }
 
-int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
 {
        return EMULATE_FAIL;
 }
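
The final hunk is what the commit title refers to: kvmppc_core_emulate_mtspr()/mfspr() now take the SPR value itself (or a pointer to it) instead of a GPR number, so the shared emulator owns all GPR access. An illustrative caller shape, not verbatim from emulate.c:

/* Sketch of how the generic emulator can use the new hooks. */
static int emulate_mfspr_example(struct kvm_vcpu *vcpu, int sprn, int rt)
{
        ulong spr_val;
        int emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, &spr_val);

        if (emulated == EMULATE_DONE)
                kvmppc_set_gpr(vcpu, rt, spr_val);
        return emulated;
}

static int emulate_mtspr_example(struct kvm_vcpu *vcpu, int sprn, int rs)
{
        return kvmppc_core_emulate_mtspr(vcpu, sprn, kvmppc_get_gpr(vcpu, rs));
}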