KVM: PPC: Implement MMU notifiers for Book3S HV guests
[linux-2.6.git] / arch/powerpc/kvm/book3s_64_mmu_hv.c
index 2d31519..83761dd 100644
@@ -281,8 +281,9 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
 }
 
 /*
- * We come here on a H_ENTER call from the guest when
- * we don't have the requested page pinned already.
+ * We come here on a H_ENTER call from the guest when we are not
+ * using mmu notifiers and we don't have the requested page pinned
+ * already.
  */
 long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                        long pte_index, unsigned long pteh, unsigned long ptel)
@@ -292,6 +293,9 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
        struct kvm_memory_slot *memslot;
        long ret;
 
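+       /*
+        * With MMU notifiers there is no need to have the page pinned
+        * before insertion; skip straight to the HPTE insert below.
+        */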
+       if (kvm->arch.using_mmu_notifiers)
+               goto do_insert;
+
        psize = hpte_page_size(pteh, ptel);
        if (!psize)
                return H_PARAMETER;
@@ -309,9 +313,12 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                        return H_PARAMETER;
        }
 
-       preempt_disable();
+ do_insert:
+       /* Protect linux PTE lookup from page table destruction */
+       rcu_read_lock_sched();  /* this disables preemption too */
+       vcpu->arch.pgdir = current->mm->pgd;
        ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel);
-       preempt_enable();
+       rcu_read_unlock_sched();
        if (ret == H_TOO_HARD) {
                /* this can't happen */
                pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
@@ -487,12 +494,16 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                unsigned long ea, unsigned long dsisr)
 {
        struct kvm *kvm = vcpu->kvm;
-       unsigned long *hptep, hpte[3];
-       unsigned long psize;
-       unsigned long gfn;
+       unsigned long *hptep, hpte[3], r;
+       unsigned long mmu_seq, psize, pte_size;
+       unsigned long gfn, hva, pfn;
        struct kvm_memory_slot *memslot;
+       unsigned long *rmap;
        struct revmap_entry *rev;
-       long index;
+       struct page *page, *pages[1];
+       long index, ret, npages;
+       unsigned long is_io;
+       struct vm_area_struct *vma;
 
        /*
         * Real-mode code has already searched the HPT and found the
@@ -510,7 +521,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                cpu_relax();
        hpte[0] = hptep[0] & ~HPTE_V_HVLOCK;
        hpte[1] = hptep[1];
-       hpte[2] = rev->guest_rpte;
+       hpte[2] = r = rev->guest_rpte;
        asm volatile("lwsync" : : : "memory");
        hptep[0] = hpte[0];
        preempt_enable();
@@ -520,8 +531,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                return RESUME_GUEST;
 
        /* Translate the logical address and get the page */
-       psize = hpte_page_size(hpte[0], hpte[1]);
-       gfn = hpte_rpn(hpte[2], psize);
+       psize = hpte_page_size(hpte[0], r);
+       gfn = hpte_rpn(r, psize);
        memslot = gfn_to_memslot(kvm, gfn);
 
        /* No memslot means it's an emulated MMIO region */
@@ -531,8 +542,228 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                              dsisr & DSISR_ISSTORE);
        }
 
-       /* should never get here otherwise */
-       return -EFAULT;
+       if (!kvm->arch.using_mmu_notifiers)
+               return -EFAULT;         /* should never get here */
+
+       /* used to check for invalidations in progress */
+       mmu_seq = kvm->mmu_notifier_seq;
+       smp_rmb();
+
+       is_io = 0;
+       pfn = 0;
+       page = NULL;
+       pte_size = PAGE_SIZE;
+       hva = gfn_to_hva_memslot(memslot, gfn);
+       npages = get_user_pages_fast(hva, 1, 1, pages);
+       if (npages < 1) {
+               /* Check if it's an I/O mapping */
+               down_read(&current->mm->mmap_sem);
+               vma = find_vma(current->mm, hva);
+               if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
+                   (vma->vm_flags & VM_PFNMAP)) {
+                       pfn = vma->vm_pgoff +
+                               ((hva - vma->vm_start) >> PAGE_SHIFT);
+                       pte_size = psize;
+                       is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
+               }
+               up_read(&current->mm->mmap_sem);
+               if (!pfn)
+                       return -EFAULT;
+       } else {
+               page = pages[0];
+               if (PageHuge(page)) {
+                       page = compound_head(page);
+                       pte_size <<= compound_order(page);
+               }
+               pfn = page_to_pfn(page);
+       }
+
+       ret = -EFAULT;
+       if (psize > pte_size)
+               goto out_put;
+
+       /* Check WIMG vs. the actual page we're accessing */
+       if (!hpte_cache_flags_ok(r, is_io)) {
+               if (is_io)
+                       return -EFAULT;
+               /*
+                * Allow guest to map emulated device memory as
+                * uncacheable, but actually make it cacheable.
+                */
+               r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
+       }
+
+       /* Set the HPTE to point to pfn */
+       r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT);
+       ret = RESUME_GUEST;
+       preempt_disable();
+       while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+               cpu_relax();
+       if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] ||
+           rev->guest_rpte != hpte[2])
+               /* HPTE has been changed under us; let the guest retry */
+               goto out_unlock;
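+       /* We have a host page now; convert the absent HPTE to a valid one */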
+       hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
+
+       rmap = &memslot->rmap[gfn - memslot->base_gfn];
+       lock_rmap(rmap);
+
+       /* Check if we might have been invalidated; let the guest retry if so */
+       ret = RESUME_GUEST;
+       if (mmu_notifier_retry(vcpu, mmu_seq)) {
+               unlock_rmap(rmap);
+               goto out_unlock;
+       }
+       kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
+
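+       /*
+        * Update the HPTE: store the second doubleword first, then make
+        * the entry valid; eieio orders the two stores and ptesync
+        * ensures the HPT update is complete.
+        */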
+       hptep[1] = r;
+       eieio();
+       hptep[0] = hpte[0];
+       asm volatile("ptesync" : : : "memory");
+       preempt_enable();
+       if (page)
+               SetPageDirty(page);
+
+ out_put:
+       if (page)
+               put_page(page);
+       return ret;
+
+ out_unlock:
+       hptep[0] &= ~HPTE_V_HVLOCK;
+       preempt_enable();
+       goto out_put;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+                         int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+                                        unsigned long gfn))
+{
+       int ret;
+       int retval = 0;
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *memslot;
+
+       slots = kvm_memslots(kvm);
+       kvm_for_each_memslot(memslot, slots) {
+               unsigned long start = memslot->userspace_addr;
+               unsigned long end;
+
+               end = start + (memslot->npages << PAGE_SHIFT);
+               if (hva >= start && hva < end) {
+                       gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+
+                       ret = handler(kvm, &memslot->rmap[gfn_offset],
+                                     memslot->base_gfn + gfn_offset);
+                       retval |= ret;
+               }
+       }
+
+       return retval;
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+                          unsigned long gfn)
+{
+       struct revmap_entry *rev = kvm->arch.revmap;
+       unsigned long h, i, j;
+       unsigned long *hptep;
+       unsigned long ptel, psize;
+
+       for (;;) {
+               while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp))
+                       cpu_relax();
+               if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
+                       __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp);
+                       break;
+               }
+
+               /*
+                * To avoid an ABBA deadlock with the HPTE lock bit,
+                * we have to unlock the rmap chain before locking the HPTE.
+                * Thus we remove the first entry, unlock the rmap chain,
+                * lock the HPTE and then check that it is for the
+                * page we're unmapping before changing it to non-present.
+                */
+               i = *rmapp & KVMPPC_RMAP_INDEX;
+               j = rev[i].forw;
+               if (j == i) {
+                       /* chain is now empty */
+                       j = 0;
+               } else {
+                       /* remove i from chain */
+                       h = rev[i].back;
+                       rev[h].forw = j;
+                       rev[j].back = h;
+                       rev[i].forw = rev[i].back = i;
+                       j |= KVMPPC_RMAP_PRESENT;
+               }
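+               /*
+                * The store below publishes the new chain head, sets the
+                * referenced bit and drops the rmap lock bit in one go;
+                * smp_wmb() orders the chain updates ahead of it.
+                */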
+               smp_wmb();
+               *rmapp = j | (1ul << KVMPPC_RMAP_REF_BIT);
+
+               /* Now lock, check and modify the HPTE */
+               hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
+               while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+                       cpu_relax();
+               ptel = rev[i].guest_rpte;
+               psize = hpte_page_size(hptep[0], ptel);
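+               /* Only invalidate if it still maps the page being unmapped */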
+               if ((hptep[0] & HPTE_V_VALID) &&
+                   hpte_rpn(ptel, psize) == gfn) {
+                       kvmppc_invalidate_hpte(kvm, hptep, i);
+                       hptep[0] |= HPTE_V_ABSENT;
+               }
+               hptep[0] &= ~HPTE_V_HVLOCK;
+       }
+       return 0;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+       if (kvm->arch.using_mmu_notifiers)
+               kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+       return 0;
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+                        unsigned long gfn)
+{
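+       /*
+        * A page counts as young if its referenced bit is set.  Unmap it
+        * so the next guest access must fault it back in, then clear the
+        * referenced bit before reporting it as young.
+        */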
+       if (!kvm->arch.using_mmu_notifiers)
+               return 0;
+       if (!(*rmapp & KVMPPC_RMAP_REFERENCED))
+               return 0;
+       kvm_unmap_rmapp(kvm, rmapp, gfn);
+       while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp))
+               cpu_relax();
+       __clear_bit(KVMPPC_RMAP_REF_BIT, rmapp);
+       __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp);
+       return 1;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       if (!kvm->arch.using_mmu_notifiers)
+               return 0;
+       return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+}
+
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+                             unsigned long gfn)
+{
+       return !!(*rmapp & KVMPPC_RMAP_REFERENCED);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       if (!kvm->arch.using_mmu_notifiers)
+               return 0;
+       return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+       if (!kvm->arch.using_mmu_notifiers)
+               return;
+       kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
 }
 
 void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
@@ -540,31 +771,42 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
 {
        struct kvm_memory_slot *memslot;
        unsigned long gfn = gpa >> PAGE_SHIFT;
-       struct page *page;
-       unsigned long psize, offset;
+       struct page *page, *pages[1];
+       int npages;
+       unsigned long hva, psize, offset;
        unsigned long pa;
        unsigned long *physp;
 
        memslot = gfn_to_memslot(kvm, gfn);
        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
                return NULL;
-       physp = kvm->arch.slot_phys[memslot->id];
-       if (!physp)
-               return NULL;
-       physp += gfn - memslot->base_gfn;
-       pa = *physp;
-       if (!pa) {
-               if (kvmppc_get_guest_page(kvm, gfn, memslot, PAGE_SIZE) < 0)
+       if (!kvm->arch.using_mmu_notifiers) {
+               physp = kvm->arch.slot_phys[memslot->id];
+               if (!physp)
                        return NULL;
+               physp += gfn - memslot->base_gfn;
                pa = *physp;
+               if (!pa) {
+                       if (kvmppc_get_guest_page(kvm, gfn, memslot,
+                                                 PAGE_SIZE) < 0)
+                               return NULL;
+                       pa = *physp;
+               }
+               page = pfn_to_page(pa >> PAGE_SHIFT);
+       } else {
+               hva = gfn_to_hva_memslot(memslot, gfn);
+               npages = get_user_pages_fast(hva, 1, 1, pages);
+               if (npages < 1)
+                       return NULL;
+               page = pages[0];
        }
-       page = pfn_to_page(pa >> PAGE_SHIFT);
        psize = PAGE_SIZE;
        if (PageHuge(page)) {
                page = compound_head(page);
                psize <<= compound_order(page);
        }
-       get_page(page);
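+       /* In the MMU-notifier case get_user_pages_fast took the reference */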
+       if (!kvm->arch.using_mmu_notifiers)
+               get_page(page);
        offset = gpa & (psize - 1);
        if (nb_ret)
                *nb_ret = psize - offset;