KVM: fix missing check for memslot flags
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 556e3efe532523df4f3bc138ba62605c253fd977..7b94d70a323fb921596e6b6bd01a7dc4ed7ac644 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -47,6 +47,8 @@
 #include <linux/srcu.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/bsearch.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
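
For reference, the two headers added above declare the kernel's generic array
helpers that later hunks depend on: sort() for ordering the memslot array and
the I/O bus ranges, and bsearch() for looking ranges up. Their declarations in
this kernel generation are approximately:

void sort(void *base, size_t num, size_t size,
          int (*cmp_func)(const void *, const void *),
          void (*swap_func)(void *, void *, int size));

void *bsearch(const void *key, const void *base, size_t num, size_t size,
              int (*cmp)(const void *, const void *));
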
@@ -84,6 +86,10 @@ struct dentry *kvm_debugfs_dir;
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
+#ifdef CONFIG_COMPAT
+static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
+                                 unsigned long arg);
+#endif
 static int hardware_enable_all(void);
 static void hardware_disable_all(void);
 
@@ -94,13 +100,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
 
 static bool largepages_enabled = true;
 
-static struct page *hwpoison_page;
-static pfn_t hwpoison_pfn;
-
-static struct page *fault_page;
-static pfn_t fault_pfn;
-
-inline int kvm_is_mmio_pfn(pfn_t pfn)
+bool kvm_is_mmio_pfn(pfn_t pfn)
 {
        if (pfn_valid(pfn)) {
                int reserved;
@@ -197,7 +197,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
-       int dirty_count = kvm->tlbs_dirty;
+       long dirty_count = kvm->tlbs_dirty;
 
        smp_mb();
        if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
@@ -230,6 +230,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
        }
        vcpu->run = page_address(page);
 
+       kvm_vcpu_set_in_spin_loop(vcpu, false);
+       kvm_vcpu_set_dy_eligible(vcpu, false);
+
        r = kvm_arch_vcpu_init(vcpu);
        if (r < 0)
                goto fail_free_run;
@@ -283,15 +286,15 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
         */
        idx = srcu_read_lock(&kvm->srcu);
        spin_lock(&kvm->mmu_lock);
+
        kvm->mmu_notifier_seq++;
        need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
-       spin_unlock(&kvm->mmu_lock);
-       srcu_read_unlock(&kvm->srcu, idx);
-
        /* we've to flush the tlb before the pages can be freed */
        if (need_tlb_flush)
                kvm_flush_remote_tlbs(kvm);
 
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, idx);
 }
 
 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
@@ -326,15 +329,14 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
         * count is also read inside the mmu_lock critical section.
         */
        kvm->mmu_notifier_count++;
-       for (; start < end; start += PAGE_SIZE)
-               need_tlb_flush |= kvm_unmap_hva(kvm, start);
+       need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
        need_tlb_flush |= kvm->tlbs_dirty;
-       spin_unlock(&kvm->mmu_lock);
-       srcu_read_unlock(&kvm->srcu, idx);
-
        /* we've to flush the tlb before the pages can be freed */
        if (need_tlb_flush)
                kvm_flush_remote_tlbs(kvm);
+
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, idx);
 }
 
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -351,11 +353,11 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
         * been freed.
         */
        kvm->mmu_notifier_seq++;
+       smp_wmb();
        /*
         * The above sequence increase must be visible before the
-        * below count decrease but both values are read by the kvm
-        * page fault under mmu_lock spinlock so we don't need to add
-        * a smb_wmb() here in between the two.
+        * below count decrease, which is ensured by the smp_wmb above
+        * in conjunction with the smp_rmb in mmu_notifier_retry().
         */
        kvm->mmu_notifier_count--;
        spin_unlock(&kvm->mmu_lock);
@@ -372,13 +374,14 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 
        idx = srcu_read_lock(&kvm->srcu);
        spin_lock(&kvm->mmu_lock);
-       young = kvm_age_hva(kvm, address);
-       spin_unlock(&kvm->mmu_lock);
-       srcu_read_unlock(&kvm->srcu, idx);
 
+       young = kvm_age_hva(kvm, address);
        if (young)
                kvm_flush_remote_tlbs(kvm);
 
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, idx);
+
        return young;
 }
 
@@ -434,7 +437,16 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
 
 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 
-static struct kvm *kvm_create_vm(void)
+static void kvm_init_memslots_id(struct kvm *kvm)
+{
+       int i;
+       struct kvm_memslots *slots = kvm->memslots;
+
+       for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
+               slots->id_to_index[i] = slots->memslots[i].id = i;
+}
+
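
kvm_init_memslots_id() establishes the invariant the rest of the patch relies
on: id_to_index[] maps a slot id back to its current position in memslots[],
which sort_memslots() below keeps ordered by size. The companion lookup helper
lives in include/linux/kvm_host.h; a sketch consistent with how this patch
uses it:

static inline struct kvm_memory_slot *
id_to_memslot(struct kvm_memslots *slots, int id)
{
        int index = slots->id_to_index[id];
        struct kvm_memory_slot *slot;

        slot = &slots->memslots[index];

        /* The two-way mapping must stay consistent. */
        WARN_ON(slot->id != id);
        return slot;
}
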
+static struct kvm *kvm_create_vm(unsigned long type)
 {
        int r, i;
        struct kvm *kvm = kvm_arch_alloc_vm();
@@ -442,7 +454,7 @@ static struct kvm *kvm_create_vm(void)
        if (!kvm)
                return ERR_PTR(-ENOMEM);
 
-       r = kvm_arch_init_vm(kvm);
+       r = kvm_arch_init_vm(kvm, type);
        if (r)
                goto out_err_nodisable;
 
@@ -459,6 +471,7 @@ static struct kvm *kvm_create_vm(void)
        kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
        if (!kvm->memslots)
                goto out_err_nosrcu;
+       kvm_init_memslots_id(kvm);
        if (init_srcu_struct(&kvm->srcu))
                goto out_err_nosrcu;
        for (i = 0; i < KVM_NR_BUSES; i++) {
@@ -468,18 +481,19 @@ static struct kvm *kvm_create_vm(void)
                        goto out_err;
        }
 
-       r = kvm_init_mmu_notifier(kvm);
-       if (r)
-               goto out_err;
-
+       spin_lock_init(&kvm->mmu_lock);
        kvm->mm = current->mm;
        atomic_inc(&kvm->mm->mm_count);
-       spin_lock_init(&kvm->mmu_lock);
        kvm_eventfd_init(kvm);
        mutex_init(&kvm->lock);
        mutex_init(&kvm->irq_lock);
        mutex_init(&kvm->slots_lock);
        atomic_set(&kvm->users_count, 1);
+
+       r = kvm_init_mmu_notifier(kvm);
+       if (r)
+               goto out_err;
+
        raw_spin_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
        raw_spin_unlock(&kvm_lock);
@@ -498,18 +512,33 @@ out_err_nodisable:
        return ERR_PTR(r);
 }
 
+/*
+ * Avoid using vmalloc for a small buffer.
+ * Should not be used when the size is statically known.
+ */
+void *kvm_kvzalloc(unsigned long size)
+{
+       if (size > PAGE_SIZE)
+               return vzalloc(size);
+       else
+               return kzalloc(size, GFP_KERNEL);
+}
+
+void kvm_kvfree(const void *addr)
+{
+       if (is_vmalloc_addr(addr))
+               vfree(addr);
+       else
+               kfree(addr);
+}
+
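
The pair simply picks the allocator by size: anything larger than a page goes
to vzalloc() (no need for physically contiguous memory), smaller buffers to
kzalloc(). A hypothetical kernel-side caller, for illustration only:

/* Hypothetical example: a table whose size depends on a runtime count. */
static u64 *foo_table_alloc(unsigned long nr_entries)
{
        /* May return a kmalloc'ed or vmalloc'ed buffer; caller can't tell. */
        return kvm_kvzalloc(nr_entries * sizeof(u64));
}

static void foo_table_free(u64 *table)
{
        /* kvm_kvfree() dispatches on is_vmalloc_addr(), so either works. */
        kvm_kvfree(table);
}
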
 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
        if (!memslot->dirty_bitmap)
                return;
 
-       if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
-               vfree(memslot->dirty_bitmap_head);
-       else
-               kfree(memslot->dirty_bitmap_head);
-
+       kvm_kvfree(memslot->dirty_bitmap);
        memslot->dirty_bitmap = NULL;
-       memslot->dirty_bitmap_head = NULL;
 }
 
 /*
@@ -518,33 +547,21 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
                                  struct kvm_memory_slot *dont)
 {
-       int i;
-
-       if (!dont || free->rmap != dont->rmap)
-               vfree(free->rmap);
-
        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
                kvm_destroy_dirty_bitmap(free);
 
-
-       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-               if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
-                       vfree(free->lpage_info[i]);
-                       free->lpage_info[i] = NULL;
-               }
-       }
+       kvm_arch_free_memslot(free, dont);
 
        free->npages = 0;
-       free->rmap = NULL;
 }
 
 void kvm_free_physmem(struct kvm *kvm)
 {
-       int i;
        struct kvm_memslots *slots = kvm->memslots;
+       struct kvm_memory_slot *memslot;
 
-       for (i = 0; i < slots->nmemslots; ++i)
-               kvm_free_physmem_slot(&slots->memslots[i], NULL);
+       kvm_for_each_memslot(memslot, slots)
+               kvm_free_physmem_slot(memslot, NULL);
 
        kfree(kvm->memslots);
 }
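
kvm_for_each_memslot() replaces the old nmemslots-bounded loop. Because
sort_memslots() below keeps the array ordered by descending npages, iteration
can stop at the first empty slot; the kvm_host.h macro reads roughly:

#define kvm_for_each_memslot(memslot, slots)                            \
        for (memslot = &slots->memslots[0];                             \
             memslot < slots->memslots + KVM_MEM_SLOTS_NUM &&           \
                memslot->npages;                                        \
             memslot++)
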
@@ -599,28 +616,75 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
        return 0;
 }
 
-#ifndef CONFIG_S390
 /*
  * Allocation size is twice as large as the actual dirty bitmap size.
- * This makes it possible to do double buffering: see x86's
- * kvm_vm_ioctl_get_dirty_log().
+ * See x86's kvm_vm_ioctl_get_dirty_log() for why this is needed.
  */
 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
+#ifndef CONFIG_S390
        unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
 
-       if (dirty_bytes > PAGE_SIZE)
-               memslot->dirty_bitmap = vzalloc(dirty_bytes);
-       else
-               memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
-
+       memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes);
        if (!memslot->dirty_bitmap)
                return -ENOMEM;
 
-       memslot->dirty_bitmap_head = memslot->dirty_bitmap;
+#endif /* !CONFIG_S390 */
+       return 0;
+}
+
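
For scale: kvm_dirty_bitmap_bytes() is one bit per guest page, rounded up to
whole longs, roughly the kvm_host.h helper below. A 1 GiB slot (262144 pages
of 4 KiB) therefore needs 32 KiB of bitmap, and this function allocates 64 KiB
to cover x86's double buffering:

static inline unsigned long
kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
{
        return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
}
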
+static int cmp_memslot(const void *slot1, const void *slot2)
+{
+       struct kvm_memory_slot *s1, *s2;
+
+       s1 = (struct kvm_memory_slot *)slot1;
+       s2 = (struct kvm_memory_slot *)slot2;
+
+       if (s1->npages < s2->npages)
+               return 1;
+       if (s1->npages > s2->npages)
+               return -1;
+
+       return 0;
+}
+
+/*
+ * Sort the memslots based on their size, so that larger slots
+ * get a better fit.
+ */
+static void sort_memslots(struct kvm_memslots *slots)
+{
+       int i;
+
+       sort(slots->memslots, KVM_MEM_SLOTS_NUM,
+             sizeof(struct kvm_memory_slot), cmp_memslot, NULL);
+
+       for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
+               slots->id_to_index[slots->memslots[i].id] = i;
+}
+
+void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
+{
+       if (new) {
+               int id = new->id;
+               struct kvm_memory_slot *old = id_to_memslot(slots, id);
+               unsigned long npages = old->npages;
+
+               *old = *new;
+               if (new->npages != npages)
+                       sort_memslots(slots);
+       }
+
+       slots->generation++;
+}
+
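
A userspace simulation (illustration only, not part of the patch) of the two
structures sort_memslots() maintains: memslots[] ordered by descending npages,
and id_to_index[] giving O(1) lookups by slot id afterwards:

#include <stdio.h>
#include <stdlib.h>

#define NSLOTS 4

struct slot { int id; unsigned long npages; };

static int cmp_slot(const void *a, const void *b)
{
        const struct slot *s1 = a, *s2 = b;

        if (s1->npages < s2->npages)    /* descending order */
                return 1;
        if (s1->npages > s2->npages)
                return -1;
        return 0;
}

int main(void)
{
        struct slot slots[NSLOTS] = {
                { 0, 16 }, { 1, 64 }, { 2, 0 }, { 3, 256 },
        };
        int id_to_index[NSLOTS], i;

        qsort(slots, NSLOTS, sizeof(slots[0]), cmp_slot);
        for (i = 0; i < NSLOTS; i++)
                id_to_index[slots[i].id] = i;

        for (i = 0; i < NSLOTS; i++)
                printf("index %d: id=%d npages=%lu\n",
                       i, slots[i].id, slots[i].npages);
        /* Prints ids 3, 1, 0, 2: empty slots sink to the tail. */
        return 0;
}
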
+static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
+{
+       if (mem->flags & ~KVM_MEM_LOG_DIRTY_PAGES)
+               return -EINVAL;
+
        return 0;
 }
-#endif /* !CONFIG_S390 */
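
check_memory_region_flags() is the fix named in the subject line: undefined
bits in kvm_userspace_memory_region.flags used to be accepted silently and
could end up in memslot->flags. A userspace sketch of the newly rejected case
(assumes a KVM-capable host; error handling trimmed for brevity):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);
        int vm = ioctl(kvm, KVM_CREATE_VM, 0);
        void *mem = mmap(NULL, 0x10000, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct kvm_userspace_memory_region region = {
                .slot = 0,
                .flags = 1u << 16,              /* no such KVM_MEM_* flag */
                .guest_phys_addr = 0,
                .memory_size = 0x10000,
                .userspace_addr = (unsigned long)mem,
        };

        /* With this patch applied, the ioctl now fails with EINVAL. */
        if (ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region) < 0)
                perror("KVM_SET_USER_MEMORY_REGION");
        return 0;
}
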
 
 /*
  * Allocate some memory and give it an address in the guest physical address
@@ -642,20 +706,29 @@ int __kvm_set_memory_region(struct kvm *kvm,
        struct kvm_memory_slot old, new;
        struct kvm_memslots *slots, *old_memslots;
 
+       r = check_memory_region_flags(mem);
+       if (r)
+               goto out;
+
        r = -EINVAL;
        /* General sanity checks */
        if (mem->memory_size & (PAGE_SIZE - 1))
                goto out;
        if (mem->guest_phys_addr & (PAGE_SIZE - 1))
                goto out;
-       if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
+       /* We can read the guest memory with __xxx_user() later on. */
+       if (user_alloc &&
+           ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
+            !access_ok(VERIFY_WRITE,
+                       (void __user *)(unsigned long)mem->userspace_addr,
+                       mem->memory_size)))
                goto out;
-       if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+       if (mem->slot >= KVM_MEM_SLOTS_NUM)
                goto out;
        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
                goto out;
 
-       memslot = &kvm->memslots->memslots[mem->slot];
+       memslot = id_to_memslot(kvm->memslots, mem->slot);
        base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
        npages = mem->memory_size >> PAGE_SHIFT;
 
@@ -697,80 +770,33 @@ int __kvm_set_memory_region(struct kvm *kvm,
        r = -ENOMEM;
 
        /* Allocate if a slot is being created */
-#ifndef CONFIG_S390
-       if (npages && !new.rmap) {
-               new.rmap = vzalloc(npages * sizeof(*new.rmap));
-
-               if (!new.rmap)
-                       goto out_free;
-
+       if (npages && !old.npages) {
                new.user_alloc = user_alloc;
                new.userspace_addr = mem->userspace_addr;
-       }
-       if (!npages)
-               goto skip_lpage;
-
-       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
-               unsigned long ugfn;
-               unsigned long j;
-               int lpages;
-               int level = i + 2;
 
-               /* Avoid unused variable warning if no large pages */
-               (void)level;
-
-               if (new.lpage_info[i])
-                       continue;
-
-               lpages = 1 + ((base_gfn + npages - 1)
-                            >> KVM_HPAGE_GFN_SHIFT(level));
-               lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
-
-               new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
-
-               if (!new.lpage_info[i])
+               if (kvm_arch_create_memslot(&new, npages))
                        goto out_free;
-
-               if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       new.lpage_info[i][0].write_count = 1;
-               if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-                       new.lpage_info[i][lpages - 1].write_count = 1;
-               ugfn = new.userspace_addr >> PAGE_SHIFT;
-               /*
-                * If the gfn and userspace address are not aligned wrt each
-                * other, or if explicitly asked to, disable large page
-                * support for this slot
-                */
-               if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
-                   !largepages_enabled)
-                       for (j = 0; j < lpages; ++j)
-                               new.lpage_info[i][j].write_count = 1;
        }
 
-skip_lpage:
-
        /* Allocate page dirty bitmap if needed */
        if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
                if (kvm_create_dirty_bitmap(&new) < 0)
                        goto out_free;
                /* destroy any largepage mappings for dirty tracking */
        }
-#else  /* not defined CONFIG_S390 */
-       new.user_alloc = user_alloc;
-       if (user_alloc)
-               new.userspace_addr = mem->userspace_addr;
-#endif /* not defined CONFIG_S390 */
 
        if (!npages) {
+               struct kvm_memory_slot *slot;
+
                r = -ENOMEM;
-               slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+               slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
+                               GFP_KERNEL);
                if (!slots)
                        goto out_free;
-               memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
-               if (mem->slot >= slots->nmemslots)
-                       slots->nmemslots = mem->slot + 1;
-               slots->generation++;
-               slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;
+               slot = id_to_memslot(slots, mem->slot);
+               slot->flags |= KVM_MEMSLOT_INVALID;
+
+               update_memslots(slots, NULL);
 
                old_memslots = kvm->memslots;
                rcu_assign_pointer(kvm->memslots, slots);
@@ -790,37 +816,40 @@ skip_lpage:
        if (r)
                goto out_free;
 
-       /* map the pages in iommu page table */
+       /* map/unmap the pages in iommu page table */
        if (npages) {
                r = kvm_iommu_map_pages(kvm, &new);
                if (r)
                        goto out_free;
-       }
+       } else
+               kvm_iommu_unmap_pages(kvm, &old);
 
        r = -ENOMEM;
-       slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+       slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
+                       GFP_KERNEL);
        if (!slots)
                goto out_free;
-       memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
-       if (mem->slot >= slots->nmemslots)
-               slots->nmemslots = mem->slot + 1;
-       slots->generation++;
 
        /* actual memory is freed via old in kvm_free_physmem_slot below */
        if (!npages) {
-               new.rmap = NULL;
                new.dirty_bitmap = NULL;
-               for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
-                       new.lpage_info[i] = NULL;
+               memset(&new.arch, 0, sizeof(new.arch));
        }
 
-       slots->memslots[mem->slot] = new;
+       update_memslots(slots, &new);
        old_memslots = kvm->memslots;
        rcu_assign_pointer(kvm->memslots, slots);
        synchronize_srcu_expedited(&kvm->srcu);
 
        kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
 
+       /*
+        * If a new memory slot is created, we need to clear all
+        * mmio sptes.
+        */
+       if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
+               kvm_arch_flush_shadow(kvm);
+
        kvm_free_physmem_slot(&old, &new);
        kfree(old_memslots);
 
@@ -869,7 +898,7 @@ int kvm_get_dirty_log(struct kvm *kvm,
        if (log->slot >= KVM_MEMORY_SLOTS)
                goto out;
 
-       memslot = &kvm->memslots->memslots[log->slot];
+       memslot = id_to_memslot(kvm->memslots, log->slot);
        r = -ENOENT;
        if (!memslot->dirty_bitmap)
                goto out;
@@ -891,35 +920,16 @@ out:
        return r;
 }
 
-void kvm_disable_largepages(void)
+bool kvm_largepages_enabled(void)
 {
-       largepages_enabled = false;
+       return largepages_enabled;
 }
-EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 
-int is_error_page(struct page *page)
-{
-       return page == bad_page || page == hwpoison_page || page == fault_page;
-}
-EXPORT_SYMBOL_GPL(is_error_page);
-
-int is_error_pfn(pfn_t pfn)
-{
-       return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_error_pfn);
-
-int is_hwpoison_pfn(pfn_t pfn)
-{
-       return pfn == hwpoison_pfn;
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
-
-int is_fault_pfn(pfn_t pfn)
+void kvm_disable_largepages(void)
 {
-       return pfn == fault_pfn;
+       largepages_enabled = false;
 }
-EXPORT_SYMBOL_GPL(is_fault_pfn);
+EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 
 static inline unsigned long bad_hva(void)
 {
@@ -932,21 +942,6 @@ int kvm_is_error_hva(unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm_memslots *slots,
-                                               gfn_t gfn)
-{
-       int i;
-
-       for (i = 0; i < slots->nmemslots; ++i) {
-               struct kvm_memory_slot *memslot = &slots->memslots[i];
-
-               if (gfn >= memslot->base_gfn
-                   && gfn < memslot->base_gfn + memslot->npages)
-                       return memslot;
-       }
-       return NULL;
-}
-
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
        return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -955,20 +950,13 @@ EXPORT_SYMBOL_GPL(gfn_to_memslot);
 
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
-       int i;
-       struct kvm_memslots *slots = kvm_memslots(kvm);
-
-       for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
-               struct kvm_memory_slot *memslot = &slots->memslots[i];
+       struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
 
-               if (memslot->flags & KVM_MEMSLOT_INVALID)
-                       continue;
+       if (!memslot || memslot->id >= KVM_MEMORY_SLOTS ||
+             memslot->flags & KVM_MEMSLOT_INVALID)
+               return 0;
 
-               if (gfn >= memslot->base_gfn
-                   && gfn < memslot->base_gfn + memslot->npages)
-                       return 1;
-       }
-       return 0;
+       return 1;
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 
@@ -996,23 +984,6 @@ out:
        return size;
 }
 
-int memslot_id(struct kvm *kvm, gfn_t gfn)
-{
-       int i;
-       struct kvm_memslots *slots = kvm_memslots(kvm);
-       struct kvm_memory_slot *memslot = NULL;
-
-       for (i = 0; i < slots->nmemslots; ++i) {
-               memslot = &slots->memslots[i];
-
-               if (gfn >= memslot->base_gfn
-                   && gfn < memslot->base_gfn + memslot->npages)
-                       break;
-       }
-
-       return memslot - slots->memslots;
-}
-
 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
                                     gfn_t *nr_pages)
 {
@@ -1031,10 +1002,15 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
-static pfn_t get_fault_pfn(void)
+int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
+       unsigned long start, int write, struct page **page)
 {
-       get_page(fault_page);
-       return fault_pfn;
+       int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
+
+       if (write)
+               flags |= FOLL_WRITE;
+
+       return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
 }
 
 static inline int check_user_page_hwpoison(unsigned long addr)
@@ -1046,8 +1022,8 @@ static inline int check_user_page_hwpoison(unsigned long addr)
        return rc == -EHWPOISON;
 }
 
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
-                       bool *async, bool write_fault, bool *writable)
+static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+                       bool write_fault, bool *writable)
 {
        struct page *page[1];
        int npages = 0;
@@ -1070,7 +1046,14 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
                if (writable)
                        *writable = write_fault;
 
-               npages = get_user_pages_fast(addr, 1, write_fault, page);
+               if (async) {
+                       down_read(&current->mm->mmap_sem);
+                       npages = get_user_page_nowait(current, current->mm,
+                                                    addr, write_fault, page);
+                       up_read(&current->mm->mmap_sem);
+               } else
+                       npages = get_user_pages_fast(addr, 1, write_fault,
+                                                    page);
 
                /* map read fault as writable if possible */
                if (unlikely(!write_fault) && npages == 1) {
@@ -1090,19 +1073,19 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
                struct vm_area_struct *vma;
 
                if (atomic)
-                       return get_fault_pfn();
+                       return KVM_PFN_ERR_FAULT;
 
                down_read(&current->mm->mmap_sem);
-               if (check_user_page_hwpoison(addr)) {
+               if (npages == -EHWPOISON ||
+                       (!async && check_user_page_hwpoison(addr))) {
                        up_read(&current->mm->mmap_sem);
-                       get_page(hwpoison_page);
-                       return page_to_pfn(hwpoison_page);
+                       return KVM_PFN_ERR_HWPOISON;
                }
 
                vma = find_vma_intersection(current->mm, addr, addr+1);
 
                if (vma == NULL)
-                       pfn = get_fault_pfn();
+                       pfn = KVM_PFN_ERR_FAULT;
                else if ((vma->vm_flags & VM_PFNMAP)) {
                        pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
                                vma->vm_pgoff;
@@ -1110,7 +1093,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
                } else {
                        if (async && (vma->vm_flags & VM_WRITE))
                                *async = true;
-                       pfn = get_fault_pfn();
+                       pfn = KVM_PFN_ERR_FAULT;
                }
                up_read(&current->mm->mmap_sem);
        } else
@@ -1119,9 +1102,9 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
        return pfn;
 }
 
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+pfn_t hva_to_pfn_atomic(unsigned long addr)
 {
-       return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
+       return hva_to_pfn(addr, true, NULL, true, NULL);
 }
 EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
 
@@ -1134,12 +1117,10 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
                *async = false;
 
        addr = gfn_to_hva(kvm, gfn);
-       if (kvm_is_error_hva(addr)) {
-               get_page(bad_page);
-               return page_to_pfn(bad_page);
-       }
+       if (kvm_is_error_hva(addr))
+               return KVM_PFN_ERR_BAD;
 
-       return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
+       return hva_to_pfn(addr, atomic, async, write_fault, writable);
 }
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1168,11 +1149,10 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
-                        struct kvm_memory_slot *slot, gfn_t gfn)
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
        unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-       return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+       return hva_to_pfn(addr, false, NULL, true, NULL);
 }
 
 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
@@ -1192,30 +1172,42 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
 }
 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
 
+static struct page *kvm_pfn_to_page(pfn_t pfn)
+{
+       if (is_error_pfn(pfn))
+               return KVM_ERR_PTR_BAD_PAGE;
+
+       if (kvm_is_mmio_pfn(pfn)) {
+               WARN_ON(1);
+               return KVM_ERR_PTR_BAD_PAGE;
+       }
+
+       return pfn_to_page(pfn);
+}
+
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
        pfn_t pfn;
 
        pfn = gfn_to_pfn(kvm, gfn);
-       if (!kvm_is_mmio_pfn(pfn))
-               return pfn_to_page(pfn);
 
-       WARN_ON(kvm_is_mmio_pfn(pfn));
-
-       get_page(bad_page);
-       return bad_page;
+       return kvm_pfn_to_page(pfn);
 }
 
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
 void kvm_release_page_clean(struct page *page)
 {
+       WARN_ON(is_error_page(page));
+
        kvm_release_pfn_clean(page_to_pfn(page));
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(pfn_t pfn)
 {
+       WARN_ON(is_error_pfn(pfn));
+
        if (!kvm_is_mmio_pfn(pfn))
                put_page(pfn_to_page(pfn));
 }
@@ -1223,6 +1215,8 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
 
 void kvm_release_page_dirty(struct page *page)
 {
+       WARN_ON(is_error_page(page));
+
        kvm_release_pfn_dirty(page_to_pfn(page));
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
@@ -1281,7 +1275,7 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
-       r = copy_from_user(data, (void __user *)addr + offset, len);
+       r = __copy_from_user(data, (void __user *)addr + offset, len);
        if (r)
                return -EFAULT;
        return 0;
@@ -1337,7 +1331,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
-       r = copy_to_user((void __user *)addr + offset, data, len);
+       r = __copy_to_user((void __user *)addr + offset, data, len);
        if (r)
                return -EFAULT;
        mark_page_dirty(kvm, gfn);
@@ -1374,7 +1368,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 
        ghc->gpa = gpa;
        ghc->generation = slots->generation;
-       ghc->memslot = __gfn_to_memslot(slots, gfn);
+       ghc->memslot = gfn_to_memslot(kvm, gfn);
        ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
        if (!kvm_is_error_hva(ghc->hva))
                ghc->hva += offset;
@@ -1397,7 +1391,7 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
        if (kvm_is_error_hva(ghc->hva))
                return -EFAULT;
 
-       r = copy_to_user((void __user *)ghc->hva, data, len);
+       r = __copy_to_user((void __user *)ghc->hva, data, len);
        if (r)
                return -EFAULT;
        mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
@@ -1406,6 +1400,26 @@ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
 
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+                          void *data, unsigned long len)
+{
+       struct kvm_memslots *slots = kvm_memslots(kvm);
+       int r;
+
+       if (slots->generation != ghc->generation)
+               kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
+
+       if (kvm_is_error_hva(ghc->hva))
+               return -EFAULT;
+
+       r = __copy_from_user(data, (void __user *)ghc->hva, len);
+       if (r)
+               return -EFAULT;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
+
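
kvm_read_guest_cached() mirrors kvm_write_guest_cached() above: both
revalidate the cached hva when the memslot generation has moved on, then take
the fast __copy_*_user() path. A usage sketch following the same pattern:

/* Sketch: read a guest u64 at @gpa; the cache is normally kept per VM. */
static int read_guest_u64(struct kvm *kvm, gpa_t gpa, u64 *val)
{
        struct gfn_to_hva_cache ghc;
        int r;

        r = kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa);  /* done once */
        if (r)
                return r;

        return kvm_read_guest_cached(kvm, &ghc, val, sizeof(*val));
}
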
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
 {
        return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
@@ -1438,7 +1452,8 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
        if (memslot && memslot->dirty_bitmap) {
                unsigned long rel_gfn = gfn - memslot->base_gfn;
 
-               __set_bit_le(rel_gfn, memslot->dirty_bitmap);
+               /* TODO: introduce set_bit_le() and use it */
+               test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap);
        }
 }
 
@@ -1475,6 +1490,30 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        finish_wait(&vcpu->wq, &wait);
 }
 
+#ifndef CONFIG_S390
+/*
+ * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
+ */
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+       int me;
+       int cpu = vcpu->cpu;
+       wait_queue_head_t *wqp;
+
+       wqp = kvm_arch_vcpu_wq(vcpu);
+       if (waitqueue_active(wqp)) {
+               wake_up_interruptible(wqp);
+               ++vcpu->stat.halt_wakeup;
+       }
+
+       me = get_cpu();
+       if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
+               if (kvm_arch_vcpu_should_kick(vcpu))
+                       smp_send_reschedule(cpu);
+       put_cpu();
+}
+#endif /* !CONFIG_S390 */
+
 void kvm_resched(struct kvm_vcpu *vcpu)
 {
        if (!need_resched())
@@ -1483,6 +1522,68 @@ void kvm_resched(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_resched);
 
+bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
+{
+       struct pid *pid;
+       struct task_struct *task = NULL;
+
+       rcu_read_lock();
+       pid = rcu_dereference(target->pid);
+       if (pid)
+               task = get_pid_task(target->pid, PIDTYPE_PID);
+       rcu_read_unlock();
+       if (!task)
+               return false;
+       if (task->flags & PF_VCPU) {
+               put_task_struct(task);
+               return false;
+       }
+       if (yield_to(task, 1)) {
+               put_task_struct(task);
+               return true;
+       }
+       put_task_struct(task);
+       return false;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
+
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+/*
+ * Helper that checks whether a VCPU is eligible for directed yield.
+ * The most eligible candidate to yield to is decided by the following
+ * heuristics:
+ *
+ *  (a) A VCPU which has not done a PL-exit or had CPU relax intercepted
+ *  recently (a preempted lock holder), indicated by @in_spin_loop.
+ *  Set at the beginning and cleared at the end of the interception/PLE
+ *  handler.
+ *
+ *  (b) A VCPU which has done a PL-exit/CPU relax intercept but did not get
+ *  a chance last time (it has most likely become eligible now, since we
+ *  probably yielded to the lock holder in the last iteration). This is
+ *  done by toggling @dy_eligible each time a VCPU is checked for
+ *  eligibility.
+ *
+ *  Yielding to a recently PL-exited/CPU relax intercepted VCPU before
+ *  yielding to a preempted lock holder could result in wrong VCPU selection
+ *  and CPU burning. Giving priority to a potential lock holder increases
+ *  lock progress.
+ *
+ *  Since the algorithm is based on heuristics, accessing another VCPU's data
+ *  without locking does no harm. At worst it may try to yield to the same
+ *  VCPU, fail, and continue with the next VCPU, and so on.
+ */
+bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+       bool eligible;
+
+       eligible = !vcpu->spin_loop.in_spin_loop ||
+                       (vcpu->spin_loop.in_spin_loop &&
+                        vcpu->spin_loop.dy_eligible);
+
+       if (vcpu->spin_loop.in_spin_loop)
+               kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
+
+       return eligible;
+}
+#endif
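
A small userspace model (illustration only) of the toggling: while a VCPU
stays in its spin loop, successive eligibility checks alternate, so a
candidate skipped in one directed-yield pass becomes eligible in the next.
Note that the kernel's expression reduces to !in_spin_loop || dy_eligible:

#include <stdbool.h>
#include <stdio.h>

struct vcpu_model { bool in_spin_loop; bool dy_eligible; };

static bool eligible_for_directed_yield(struct vcpu_model *v)
{
        bool eligible = !v->in_spin_loop || v->dy_eligible;

        if (v->in_spin_loop)
                v->dy_eligible = !v->dy_eligible;  /* flip for next check */
        return eligible;
}

int main(void)
{
        struct vcpu_model v = { .in_spin_loop = true, .dy_eligible = false };
        int i;

        for (i = 0; i < 4; i++)
                printf("check %d: %s\n", i,
                       eligible_for_directed_yield(&v) ? "eligible" : "skip");
        /* Prints: skip, eligible, skip, eligible */
        return 0;
}
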
 void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 {
        struct kvm *kvm = me->kvm;
@@ -1492,6 +1593,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
        int pass;
        int i;
 
+       kvm_vcpu_set_in_spin_loop(me, true);
        /*
         * We boost the priority of a VCPU that is runnable but not
         * currently running, because it got preempted by something
@@ -1501,9 +1603,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
         */
        for (pass = 0; pass < 2 && !yielded; pass++) {
                kvm_for_each_vcpu(i, vcpu, kvm) {
-                       struct task_struct *task = NULL;
-                       struct pid *pid;
-                       if (!pass && i < last_boosted_vcpu) {
+                       if (!pass && i <= last_boosted_vcpu) {
                                i = last_boosted_vcpu;
                                continue;
                        } else if (pass && i > last_boosted_vcpu)
@@ -1512,26 +1612,19 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
                                continue;
                        if (waitqueue_active(&vcpu->wq))
                                continue;
-                       rcu_read_lock();
-                       pid = rcu_dereference(vcpu->pid);
-                       if (pid)
-                               task = get_pid_task(vcpu->pid, PIDTYPE_PID);
-                       rcu_read_unlock();
-                       if (!task)
+                       if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                                continue;
-                       if (task->flags & PF_VCPU) {
-                               put_task_struct(task);
-                               continue;
-                       }
-                       if (yield_to(task, 1)) {
-                               put_task_struct(task);
+                       if (kvm_vcpu_yield_to(vcpu)) {
                                kvm->last_boosted_vcpu = i;
                                yielded = 1;
                                break;
                        }
-                       put_task_struct(task);
                }
        }
+       kvm_vcpu_set_in_spin_loop(me, false);
+
+       /* Ensure vcpu is not eligible during next spinloop */
+       kvm_vcpu_set_dy_eligible(me, false);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
 
@@ -1551,7 +1644,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
 #endif
        else
-               return VM_FAULT_SIGBUS;
+               return kvm_arch_vcpu_fault(vcpu, vmf);
        get_page(page);
        vmf->page = page;
        return 0;
@@ -1578,7 +1671,9 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
 static struct file_operations kvm_vcpu_fops = {
        .release        = kvm_vcpu_release,
        .unlocked_ioctl = kvm_vcpu_ioctl,
-       .compat_ioctl   = kvm_vcpu_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = kvm_vcpu_compat_ioctl,
+#endif
        .mmap           = kvm_vcpu_mmap,
        .llseek         = noop_llseek,
 };
@@ -1607,18 +1702,22 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 
        r = kvm_arch_vcpu_setup(vcpu);
        if (r)
-               return r;
+               goto vcpu_destroy;
 
        mutex_lock(&kvm->lock);
+       if (!kvm_vcpu_compatible(vcpu)) {
+               r = -EINVAL;
+               goto unlock_vcpu_destroy;
+       }
        if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
                r = -EINVAL;
-               goto vcpu_destroy;
+               goto unlock_vcpu_destroy;
        }
 
        kvm_for_each_vcpu(r, v, kvm)
                if (v->vcpu_id == id) {
                        r = -EEXIST;
-                       goto vcpu_destroy;
+                       goto unlock_vcpu_destroy;
                }
 
        BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
@@ -1628,22 +1727,19 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        r = create_vcpu_fd(vcpu);
        if (r < 0) {
                kvm_put_kvm(kvm);
-               goto vcpu_destroy;
+               goto unlock_vcpu_destroy;
        }
 
        kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
        smp_wmb();
        atomic_inc(&kvm->online_vcpus);
 
-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
-       if (kvm->bsp_vcpu_id == id)
-               kvm->bsp_vcpu = vcpu;
-#endif
        mutex_unlock(&kvm->lock);
        return r;
 
-vcpu_destroy:
+unlock_vcpu_destroy:
        mutex_unlock(&kvm->lock);
+vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
        return r;
 }
@@ -1712,12 +1808,11 @@ out_free1:
                struct kvm_regs *kvm_regs;
 
                r = -ENOMEM;
-               kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
-               if (!kvm_regs)
+               kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
+               if (IS_ERR(kvm_regs)) {
+                       r = PTR_ERR(kvm_regs);
                        goto out;
-               r = -EFAULT;
-               if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
-                       goto out_free2;
+               }
                r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
                if (r)
                        goto out_free2;
@@ -1741,13 +1836,11 @@ out_free2:
                break;
        }
        case KVM_SET_SREGS: {
-               kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
-               r = -ENOMEM;
-               if (!kvm_sregs)
-                       goto out;
-               r = -EFAULT;
-               if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
+               kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
+               if (IS_ERR(kvm_sregs)) {
+                       r = PTR_ERR(kvm_sregs);
                        goto out;
+               }
                r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
                if (r)
                        goto out;
@@ -1843,13 +1936,11 @@ out_free2:
                break;
        }
        case KVM_SET_FPU: {
-               fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
-               r = -ENOMEM;
-               if (!fpu)
-                       goto out;
-               r = -EFAULT;
-               if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
+               fpu = memdup_user(argp, sizeof(*fpu));
+               if (IS_ERR(fpu)) {
+                       r = PTR_ERR(fpu);
                        goto out;
+               }
                r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
                if (r)
                        goto out;
@@ -1866,6 +1957,50 @@ out:
        return r;
 }
 
+#ifdef CONFIG_COMPAT
+static long kvm_vcpu_compat_ioctl(struct file *filp,
+                                 unsigned int ioctl, unsigned long arg)
+{
+       struct kvm_vcpu *vcpu = filp->private_data;
+       void __user *argp = compat_ptr(arg);
+       int r;
+
+       if (vcpu->kvm->mm != current->mm)
+               return -EIO;
+
+       switch (ioctl) {
+       case KVM_SET_SIGNAL_MASK: {
+               struct kvm_signal_mask __user *sigmask_arg = argp;
+               struct kvm_signal_mask kvm_sigmask;
+               compat_sigset_t csigset;
+               sigset_t sigset;
+
+               if (argp) {
+                       r = -EFAULT;
+                       if (copy_from_user(&kvm_sigmask, argp,
+                                          sizeof kvm_sigmask))
+                               goto out;
+                       r = -EINVAL;
+                       if (kvm_sigmask.len != sizeof csigset)
+                               goto out;
+                       r = -EFAULT;
+                       if (copy_from_user(&csigset, sigmask_arg->sigset,
+                                          sizeof csigset))
+                               goto out;
+               }
+               sigset_from_compat(&sigset, &csigset);
+               r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+               break;
+       }
+       default:
+               r = kvm_vcpu_ioctl(filp, ioctl, arg);
+       }
+
+out:
+       return r;
+}
+#endif
+
 static long kvm_vm_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
 {
@@ -1935,7 +2070,7 @@ static long kvm_vm_ioctl(struct file *filp,
                r = -EFAULT;
                if (copy_from_user(&data, argp, sizeof data))
                        goto out;
-               r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
+               r = kvm_irqfd(kvm, &data);
                break;
        }
        case KVM_IOEVENTFD: {
@@ -1957,6 +2092,40 @@ static long kvm_vm_ioctl(struct file *filp,
                        kvm->bsp_vcpu_id = arg;
                mutex_unlock(&kvm->lock);
                break;
+#endif
+#ifdef CONFIG_HAVE_KVM_MSI
+       case KVM_SIGNAL_MSI: {
+               struct kvm_msi msi;
+
+               r = -EFAULT;
+               if (copy_from_user(&msi, argp, sizeof msi))
+                       goto out;
+               r = kvm_send_userspace_msi(kvm, &msi);
+               break;
+       }
+#endif
+#ifdef __KVM_HAVE_IRQ_LINE
+       case KVM_IRQ_LINE_STATUS:
+       case KVM_IRQ_LINE: {
+               struct kvm_irq_level irq_event;
+
+               r = -EFAULT;
+               if (copy_from_user(&irq_event, argp, sizeof irq_event))
+                       goto out;
+
+               r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
+               if (r)
+                       goto out;
+
+               r = -EFAULT;
+               if (ioctl == KVM_IRQ_LINE_STATUS) {
+                       if (copy_to_user(argp, &irq_event, sizeof irq_event))
+                               goto out;
+               }
+
+               r = 0;
+               break;
+       }
 #endif
        default:
                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
@@ -2054,12 +2223,12 @@ static struct file_operations kvm_vm_fops = {
        .llseek         = noop_llseek,
 };
 
-static int kvm_dev_ioctl_create_vm(void)
+static int kvm_dev_ioctl_create_vm(unsigned long type)
 {
        int r;
        struct kvm *kvm;
 
-       kvm = kvm_create_vm();
+       kvm = kvm_create_vm(type);
        if (IS_ERR(kvm))
                return PTR_ERR(kvm);
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
@@ -2086,8 +2255,11 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
        case KVM_CAP_SET_BOOT_CPU_ID:
 #endif
        case KVM_CAP_INTERNAL_ERROR_DATA:
+#ifdef CONFIG_HAVE_KVM_MSI
+       case KVM_CAP_SIGNAL_MSI:
+#endif
                return 1;
-#ifdef CONFIG_HAVE_KVM_IRQCHIP
+#ifdef KVM_CAP_IRQ_ROUTING
        case KVM_CAP_IRQ_ROUTING:
                return KVM_MAX_IRQ_ROUTES;
 #endif
@@ -2110,10 +2282,7 @@ static long kvm_dev_ioctl(struct file *filp,
                r = KVM_API_VERSION;
                break;
        case KVM_CREATE_VM:
-               r = -EINVAL;
-               if (arg)
-                       goto out;
-               r = kvm_dev_ioctl_create_vm();
+               r = kvm_dev_ioctl_create_vm(arg);
                break;
        case KVM_CHECK_EXTENSION:
                r = kvm_dev_ioctl_check_extension_generic(arg);
@@ -2293,24 +2462,89 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
        int i;
 
        for (i = 0; i < bus->dev_count; i++) {
-               struct kvm_io_device *pos = bus->devs[i];
+               struct kvm_io_device *pos = bus->range[i].dev;
 
                kvm_iodevice_destructor(pos);
        }
        kfree(bus);
 }
 
+int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+{
+       const struct kvm_io_range *r1 = p1;
+       const struct kvm_io_range *r2 = p2;
+
+       if (r1->addr < r2->addr)
+               return -1;
+       if (r1->addr + r1->len > r2->addr + r2->len)
+               return 1;
+       return 0;
+}
+
+int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
+                         gpa_t addr, int len)
+{
+       bus->range[bus->dev_count++] = (struct kvm_io_range) {
+               .addr = addr,
+               .len = len,
+               .dev = dev,
+       };
+
+       sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
+               kvm_io_bus_sort_cmp, NULL);
+
+       return 0;
+}
+
+int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
+                            gpa_t addr, int len)
+{
+       struct kvm_io_range *range, key;
+       int off;
+
+       key = (struct kvm_io_range) {
+               .addr = addr,
+               .len = len,
+       };
+
+       range = bsearch(&key, bus->range, bus->dev_count,
+                       sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
+       if (range == NULL)
+               return -ENOENT;
+
+       off = range - bus->range;
+
+       while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
+               off--;
+
+       return off;
+}
+
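
The comparator reports "equal" whenever the key range is contained in a
registered range (key.addr >= r.addr and key.addr + key.len <= r.addr +
r.len), so bsearch() lands on some overlapping entry and the loop above backs
up to the first one. A userspace demonstration with the same comparator
(illustration only):

#include <stdio.h>
#include <stdlib.h>

struct io_range { unsigned long addr; int len; };

static int range_cmp(const void *p1, const void *p2)
{
        const struct io_range *r1 = p1, *r2 = p2;

        if (r1->addr < r2->addr)
                return -1;
        if (r1->addr + r1->len > r2->addr + r2->len)
                return 1;
        return 0;       /* r1 contained in r2: treated as a match */
}

int main(void)
{
        struct io_range bus[] = {
                { 0x100, 8 }, { 0x200, 4 }, { 0x200, 8 }, { 0x300, 1 },
        };
        struct io_range key = { 0x200, 2 }, *hit;

        qsort(bus, 4, sizeof(bus[0]), range_cmp);
        hit = bsearch(&key, bus, 4, sizeof(bus[0]), range_cmp);
        if (hit)
                printf("matched range 0x%lx len %d\n", hit->addr, hit->len);
        return 0;
}
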
 /* kvm_io_bus_write - called under kvm->slots_lock */
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                     int len, const void *val)
 {
-       int i;
+       int idx;
        struct kvm_io_bus *bus;
+       struct kvm_io_range range;
+
+       range = (struct kvm_io_range) {
+               .addr = addr,
+               .len = len,
+       };
 
        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-       for (i = 0; i < bus->dev_count; i++)
-               if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+       idx = kvm_io_bus_get_first_dev(bus, addr, len);
+       if (idx < 0)
+               return -EOPNOTSUPP;
+
+       while (idx < bus->dev_count &&
+               kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
+               if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
                        return 0;
+               idx++;
+       }
+
        return -EOPNOTSUPP;
 }
 
@@ -2318,31 +2552,47 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val)
 {
-       int i;
+       int idx;
        struct kvm_io_bus *bus;
+       struct kvm_io_range range;
+
+       range = (struct kvm_io_range) {
+               .addr = addr,
+               .len = len,
+       };
 
        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-       for (i = 0; i < bus->dev_count; i++)
-               if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
+       idx = kvm_io_bus_get_first_dev(bus, addr, len);
+       if (idx < 0)
+               return -EOPNOTSUPP;
+
+       while (idx < bus->dev_count &&
+               kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
+               if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
                        return 0;
+               idx++;
+       }
+
        return -EOPNOTSUPP;
 }
 
 /* Caller must hold slots_lock. */
-int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-                           struct kvm_io_device *dev)
+int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+                           int len, struct kvm_io_device *dev)
 {
        struct kvm_io_bus *new_bus, *bus;
 
        bus = kvm->buses[bus_idx];
-       if (bus->dev_count > NR_IOBUS_DEVS-1)
+       if (bus->dev_count > NR_IOBUS_DEVS - 1)
                return -ENOSPC;
 
-       new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
+       new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
+                         sizeof(struct kvm_io_range)), GFP_KERNEL);
        if (!new_bus)
                return -ENOMEM;
-       memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
-       new_bus->devs[new_bus->dev_count++] = dev;
+       memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count *
+              sizeof(struct kvm_io_range)));
+       kvm_io_bus_insert_dev(new_bus, dev, addr, len);
        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
        synchronize_srcu_expedited(&kvm->srcu);
        kfree(bus);
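
The allocation arithmetic above implies kvm_io_bus now ends in a flexible
array of kvm_io_range, sized exactly for dev_count entries instead of a fixed
NR_IOBUS_DEVS array. A sketch of the pattern (struct layout assumed from this
patch, not quoted from the header):

struct kvm_io_bus_like {
        int dev_count;
        struct kvm_io_range range[];    /* flexible array member */
};

static struct kvm_io_bus_like *bus_copy_grow(struct kvm_io_bus_like *old)
{
        size_t old_bytes = sizeof(*old) +
                           old->dev_count * sizeof(struct kvm_io_range);
        struct kvm_io_bus_like *new;

        /* Room for the existing ranges plus one more entry. */
        new = kzalloc(old_bytes + sizeof(struct kvm_io_range), GFP_KERNEL);
        if (!new)
                return NULL;
        memcpy(new, old, old_bytes);
        return new;
}
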
@@ -2357,25 +2607,26 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
        int i, r;
        struct kvm_io_bus *new_bus, *bus;
 
-       new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
-       if (!new_bus)
-               return -ENOMEM;
-
        bus = kvm->buses[bus_idx];
-       memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
-
        r = -ENOENT;
-       for (i = 0; i < new_bus->dev_count; i++)
-               if (new_bus->devs[i] == dev) {
+       for (i = 0; i < bus->dev_count; i++)
+               if (bus->range[i].dev == dev) {
                        r = 0;
-                       new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
                        break;
                }
 
-       if (r) {
-               kfree(new_bus);
+       if (r)
                return r;
-       }
+
+       new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
+                         sizeof(struct kvm_io_range)), GFP_KERNEL);
+       if (!new_bus)
+               return -ENOMEM;
+
+       memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
+       new_bus->dev_count--;
+       memcpy(new_bus->range + i, bus->range + i + 1,
+              (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
 
        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
        synchronize_srcu_expedited(&kvm->srcu);
@@ -2426,15 +2677,29 @@ static const struct file_operations *stat_fops[] = {
        [KVM_STAT_VM]   = &vm_stat_fops,
 };
 
-static void kvm_init_debug(void)
+static int kvm_init_debug(void)
 {
+       int r = -EFAULT;
        struct kvm_stats_debugfs_item *p;
 
        kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
-       for (p = debugfs_entries; p->name; ++p)
+       if (kvm_debugfs_dir == NULL)
+               goto out;
+
+       for (p = debugfs_entries; p->name; ++p) {
                p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
                                                (void *)(long)p->offset,
                                                stat_fops[p->kind]);
+               if (p->dentry == NULL)
+                       goto out_dir;
+       }
+
+       return 0;
+
+out_dir:
+       debugfs_remove_recursive(kvm_debugfs_dir);
+out:
+       return r;
 }
 
 static void kvm_exit_debug(void)
@@ -2466,9 +2731,6 @@ static struct syscore_ops kvm_syscore_ops = {
        .resume = kvm_resume,
 };
 
-struct page *bad_page;
-pfn_t bad_pfn;
-
 static inline
 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
 {
@@ -2500,33 +2762,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
        if (r)
                goto out_fail;
 
-       bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
-       if (bad_page == NULL) {
-               r = -ENOMEM;
-               goto out;
-       }
-
-       bad_pfn = page_to_pfn(bad_page);
-
-       hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
-       if (hwpoison_page == NULL) {
-               r = -ENOMEM;
-               goto out_free_0;
-       }
-
-       hwpoison_pfn = page_to_pfn(hwpoison_page);
-
-       fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
-       if (fault_page == NULL) {
-               r = -ENOMEM;
-               goto out_free_0;
-       }
-
-       fault_pfn = page_to_pfn(fault_page);
-
        if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
                r = -ENOMEM;
                goto out_free_0;
@@ -2578,10 +2813,16 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
        kvm_preempt_ops.sched_in = kvm_sched_in;
        kvm_preempt_ops.sched_out = kvm_sched_out;
 
-       kvm_init_debug();
+       r = kvm_init_debug();
+       if (r) {
+               printk(KERN_ERR "kvm: create debugfs files failed\n");
+               goto out_undebugfs;
+       }
 
        return 0;
 
+out_undebugfs:
+       unregister_syscore_ops(&kvm_syscore_ops);
 out_unreg:
        kvm_async_pf_deinit();
 out_free:
@@ -2595,12 +2836,6 @@ out_free_1:
 out_free_0a:
        free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
-       if (fault_page)
-               __free_page(fault_page);
-       if (hwpoison_page)
-               __free_page(hwpoison_page);
-       __free_page(bad_page);
-out:
        kvm_arch_exit();
 out_fail:
        return r;
@@ -2620,7 +2855,5 @@ void kvm_exit(void)
        kvm_arch_hardware_unsetup();
        kvm_arch_exit();
        free_cpumask_var(cpus_hardware_enabled);
-       __free_page(hwpoison_page);
-       __free_page(bad_page);
 }
 EXPORT_SYMBOL_GPL(kvm_exit);