KVM: remove unnecessary return value check
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2a59f76..c31f75d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -57,6 +57,7 @@
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/i387.h>
+#include <asm/fpu-internal.h> /* Ugh! */
 #include <asm/xcr.h>
 #include <asm/pvclock.h>
 #include <asm/div64.h>
@@ -157,7 +158,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 u64 __read_mostly host_xcr0;
 
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
 
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
@@ -245,20 +248,14 @@ static void drop_user_return_notifiers(void *ignore)
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 {
-       if (irqchip_in_kernel(vcpu->kvm))
-               return vcpu->arch.apic_base;
-       else
-               return vcpu->arch.apic_base;
+       return vcpu->arch.apic_base;
 }
 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 {
        /* TODO: reserve bits check */
-       if (irqchip_in_kernel(vcpu->kvm))
-               kvm_lapic_set_base(vcpu, data);
-       else
-               vcpu->arch.apic_base = data;
+       kvm_lapic_set_base(vcpu, data);
 }
 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 
@@ -527,6 +524,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
                        return 1;
        }
 
+       if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+               return 1;
+
        kvm_x86_ops->set_cr0(vcpu, cr0);
 
        if ((cr0 ^ old_cr0) & X86_CR0_PG) {
@@ -603,10 +603,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                                   kvm_read_cr3(vcpu)))
                return 1;
 
+       if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
+               if (!guest_cpuid_has_pcid(vcpu))
+                       return 1;
+
+               /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
+               if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+                       return 1;
+       }
+
        if (kvm_x86_ops->set_cr4(vcpu, cr4))
                return 1;
 
-       if ((cr4 ^ old_cr4) & pdptr_bits)
+       if (((cr4 ^ old_cr4) & pdptr_bits) ||
+           (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
                kvm_mmu_reset_context(vcpu);
 
        if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
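
As a rough user-space illustration of the precondition added above (not kernel code; constants and the check order are simplified), CR4.PCIDE may only be turned on when the guest advertises PCID, is in long mode, and has the low 12 bits of CR3 clear:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define X86_CR3_PCID_MASK 0xFFFull	/* low 12 bits of CR3 carry the PCID */

/* Illustrative check: returns true if flipping CR4.PCIDE from 0 to 1
 * would have to be rejected with #GP, mirroring the hunk above. */
static bool pcide_enable_faults(uint64_t cr3, bool long_mode, bool cpuid_has_pcid)
{
	if (!cpuid_has_pcid)
		return true;		/* guest CPUID does not advertise PCID */
	if (cr3 & X86_CR3_PCID_MASK)
		return true;		/* CR3[11:0] must be zero */
	if (!long_mode)
		return true;		/* EFER.LMA must be 1 */
	return false;
}

int main(void)
{
	printf("%d\n", pcide_enable_faults(0x1000, true, true));	/* 0: allowed */
	printf("%d\n", pcide_enable_faults(0x1005, true, true));	/* 1: PCID bits set */
	return 0;
}
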
@@ -625,8 +635,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        }
 
        if (is_long_mode(vcpu)) {
-               if (cr3 & CR3_L_MODE_RESERVED_BITS)
-                       return 1;
+               if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
+                       if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
+                               return 1;
+               } else
+                       if (cr3 & CR3_L_MODE_RESERVED_BITS)
+                               return 1;
        } else {
                if (is_pae(vcpu)) {
                        if (cr3 & CR3_PAE_RESERVED_BITS)
@@ -680,6 +694,18 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
+static void kvm_update_dr7(struct kvm_vcpu *vcpu)
+{
+       unsigned long dr7;
+
+       if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+               dr7 = vcpu->arch.guest_debug_dr7;
+       else
+               dr7 = vcpu->arch.dr7;
+       kvm_x86_ops->set_dr7(vcpu, dr7);
+       vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
+}
+
 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 {
        switch (dr) {
@@ -705,10 +731,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
                if (val & 0xffffffff00000000ULL)
                        return -1; /* #GP */
                vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
-               if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
-                       kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
-                       vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
-               }
+               kvm_update_dr7(vcpu);
                break;
        }
 
@@ -788,12 +811,13 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN    9
+#define KVM_SAVE_MSRS_BEGIN    10
 static u32 msrs_to_save[] = {
        MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
        MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
        HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
        HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+       MSR_KVM_PV_EOI_EN,
        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
        MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -804,7 +828,7 @@ static u32 msrs_to_save[] = {
 
 static unsigned num_msrs_to_save;
 
-static u32 emulated_msrs[] = {
+static const u32 emulated_msrs[] = {
        MSR_IA32_TSCDEADLINE,
        MSR_IA32_MISC_ENABLE,
        MSR_IA32_MCG_STATUS,
@@ -906,6 +930,10 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
         */
        getboottime(&boot);
 
+       if (kvm->arch.kvmclock_offset) {
+               struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
+               boot = timespec_sub(boot, ts);
+       }
        wc.sec = boot.tv_sec;
        wc.nsec = boot.tv_nsec;
        wc.version = version;
@@ -1013,10 +1041,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
 
 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 {
-       u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
+       u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
                                      vcpu->arch.virtual_tsc_mult,
                                      vcpu->arch.virtual_tsc_shift);
-       tsc += vcpu->arch.last_tsc_write;
+       tsc += vcpu->arch.this_tsc_write;
        return tsc;
 }
 
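
For reference, a minimal user-space sketch of the catch-up computation, assuming pvclock's usual fixed-point semantics (shift the nanosecond delta, then keep the high 64 bits of a 32.32 multiply); scale_delta below is an illustrative stand-in for pvclock_scale_delta, not the kernel helper:

#include <stdint.h>
#include <stdio.h>

/* Scale a nanosecond delta into guest TSC cycles: shift first, then
 * multiply by a 32.32 fixed-point fraction, keeping the upper 64 bits. */
static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

/* Guest TSC at 'kernel_ns', relative to the generation base recorded in
 * this_tsc_nsec/this_tsc_write (field names mirror the patch). */
static uint64_t guest_tsc(uint64_t kernel_ns, uint64_t this_tsc_nsec,
			  uint64_t this_tsc_write, uint32_t mult, int shift)
{
	return scale_delta(kernel_ns - this_tsc_nsec, mult, shift) + this_tsc_write;
}

int main(void)
{
	/* A 1 GHz guest: mult/shift chosen so one nanosecond maps to one cycle. */
	printf("%llu\n", (unsigned long long)guest_tsc(2000000, 1000000, 500, 1u << 31, 1));
	return 0;
}
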
@@ -1025,7 +1053,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
        struct kvm *kvm = vcpu->kvm;
        u64 offset, ns, elapsed;
        unsigned long flags;
-       s64 nsdiff;
+       s64 usdiff;
 
        raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
        offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
@@ -1033,18 +1061,19 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
        elapsed = ns - kvm->arch.last_tsc_nsec;
 
        /* n.b - signed multiplication and division required */
-       nsdiff = data - kvm->arch.last_tsc_write;
+       usdiff = data - kvm->arch.last_tsc_write;
 #ifdef CONFIG_X86_64
-       nsdiff = (nsdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+       usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
 #else
        /* do_div() only does unsigned */
        asm("idivl %2; xor %%edx, %%edx"
-           : "=A"(nsdiff)
-           : "A"(nsdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+           : "=A"(usdiff)
+           : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
 #endif
-       nsdiff -= elapsed;
-       if (nsdiff < 0)
-               nsdiff = -nsdiff;
+       do_div(elapsed, 1000);
+       usdiff -= elapsed;
+       if (usdiff < 0)
+               usdiff = -usdiff;
 
        /*
         * Special case: TSC write with a small delta (1 second) of virtual
@@ -1056,10 +1085,10 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
         * compensation code attempt to catch up if we fall behind, but
         * it's better to try to match offsets from the beginning.
          */
-       if (nsdiff < NSEC_PER_SEC &&
+       if (usdiff < USEC_PER_SEC &&
            vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
                if (!check_tsc_unstable()) {
-                       offset = kvm->arch.last_tsc_offset;
+                       offset = kvm->arch.cur_tsc_offset;
                        pr_debug("kvm: matched tsc offset for %llu\n", data);
                } else {
                        u64 delta = nsec_to_cycles(vcpu, elapsed);
@@ -1067,20 +1096,45 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
                        offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
                        pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
                }
+       } else {
+               /*
+                * We split periods of matched TSC writes into generations.
+                * For each generation, we track the original measured
+                * nanosecond time, offset, and write, so if TSCs are in
+                * sync, we can match exact offset, and if not, we can match
+                * exact software computation in compute_guest_tsc()
+                *
+                * These values are tracked in kvm->arch.cur_xxx variables.
+                */
+               kvm->arch.cur_tsc_generation++;
+               kvm->arch.cur_tsc_nsec = ns;
+               kvm->arch.cur_tsc_write = data;
+               kvm->arch.cur_tsc_offset = offset;
+               pr_debug("kvm: new tsc generation %u, clock %llu\n",
+                        kvm->arch.cur_tsc_generation, data);
        }
+
+       /*
+        * We also track the most recently recorded KHZ, write and time to
+        * allow the matching interval to be extended at each write.
+        */
        kvm->arch.last_tsc_nsec = ns;
        kvm->arch.last_tsc_write = data;
-       kvm->arch.last_tsc_offset = offset;
        kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
-       kvm_x86_ops->write_tsc_offset(vcpu, offset);
-       raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
        /* Reset of TSC must disable overshoot protection below */
        vcpu->arch.hv_clock.tsc_timestamp = 0;
-       vcpu->arch.last_tsc_write = data;
-       vcpu->arch.last_tsc_nsec = ns;
        vcpu->arch.last_guest_tsc = data;
+
+       /* Keep track of which generation this VCPU has synchronized to */
+       vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+       vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+       vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+       kvm_x86_ops->write_tsc_offset(vcpu, offset);
+       raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 }
+
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
 static int kvm_guest_time_update(struct kvm_vcpu *v)
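
The matching decision above now works in microseconds rather than nanoseconds; a hedged, stand-alone sketch of that comparison (plain C, simplified field handling, names are illustrative): a write joins the current generation only if it lands within one second of where the previous write would have advanced to.

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

#define USEC_PER_SEC 1000000LL

/* Does this TSC write "match" the last one closely enough to reuse the
 * same generation?  data/last_write are guest TSC values, elapsed_ns is
 * host time since the last write, tsc_khz the virtual TSC frequency. */
static bool tsc_write_matches(uint64_t data, uint64_t last_write,
			      int64_t elapsed_ns, uint32_t tsc_khz)
{
	/* cycles / kHz = ms, so cycles * 1000 / kHz = us (signed, as above) */
	int64_t usdiff = (int64_t)(data - last_write) * 1000 / tsc_khz;

	usdiff -= elapsed_ns / 1000;	/* subtract elapsed host time in us */
	return llabs(usdiff) < USEC_PER_SEC;
}

int main(void)
{
	/* 2 GHz guest, 10 ms elapsed, TSC advanced by ~10 ms worth of cycles. */
	printf("%d\n", tsc_write_matches(20000000, 0, 10000000, 2000000));	/* 1 */
	/* Same elapsed time, but the written value jumps ahead by 5 seconds. */
	printf("%d\n", tsc_write_matches(10000000000ULL, 0, 10000000, 2000000));	/* 0 */
	return 0;
}
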
@@ -1091,6 +1145,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        unsigned long this_tsc_khz;
        s64 kernel_ns, max_kernel_ns;
        u64 tsc_timestamp;
+       u8 pvclock_flags;
 
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
@@ -1116,7 +1171,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        if (vcpu->tsc_catchup) {
                u64 tsc = compute_guest_tsc(v, kernel_ns);
                if (tsc > tsc_timestamp) {
-                       kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+                       adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
                        tsc_timestamp = tsc;
                }
        }
@@ -1172,7 +1227,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
        vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
        vcpu->last_kernel_ns = kernel_ns;
        vcpu->last_guest_tsc = tsc_timestamp;
-       vcpu->hv_clock.flags = 0;
+
+       pvclock_flags = 0;
+       if (vcpu->pvclock_set_guest_stopped_request) {
+               pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+               vcpu->pvclock_set_guest_stopped_request = false;
+       }
+
+       vcpu->hv_clock.flags = pvclock_flags;
 
        /*
         * The interface expects us to write an even number signaling that the
@@ -1181,12 +1243,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
         */
        vcpu->hv_clock.version += 2;
 
-       shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+       shared_kaddr = kmap_atomic(vcpu->time_page);
 
        memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
               sizeof(vcpu->hv_clock));
 
-       kunmap_atomic(shared_kaddr, KM_USER0);
+       kunmap_atomic(shared_kaddr);
 
        mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
        return 0;
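
The even/odd version handshake mentioned in the comment above is a seqcount-style protocol; a user-space sketch of how a guest-side reader would consume it (struct layout abbreviated, names illustrative, not the real pvclock structures):

#include <stdint.h>

/* Abbreviated pvclock-style time info as published by the host. */
struct time_info {
	volatile uint32_t version;	/* odd while the host is mid-update */
	uint64_t system_time;
	uint64_t tsc_timestamp;
};

/* Retry until the same even version is observed before and after the
 * copy, which guarantees a consistent snapshot of the payload fields. */
static struct time_info read_time_info(const struct time_info *shared)
{
	struct time_info snap;
	uint32_t v;

	do {
		v = shared->version;
		__sync_synchronize();		/* order the version read vs. the copy */
		snap.system_time = shared->system_time;
		snap.tsc_timestamp = shared->tsc_timestamp;
		__sync_synchronize();
	} while ((v & 1) || v != shared->version);

	snap.version = v;
	return snap;
}

int main(void)
{
	struct time_info shared = { .version = 2, .system_time = 123, .tsc_timestamp = 456 };

	return read_time_info(&shared).system_time == 123 ? 0 : 1;
}
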
@@ -1410,8 +1472,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                break;
        }
        default:
-               pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
-                         "data 0x%llx\n", msr, data);
+               vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+                           "data 0x%llx\n", msr, data);
                return 1;
        }
        return 0;
@@ -1443,8 +1505,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        case HV_X64_MSR_TPR:
                return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
        default:
-               pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
-                         "data 0x%llx\n", msr, data);
+               vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+                           "data 0x%llx\n", msr, data);
                return 1;
        }
 
@@ -1455,7 +1517,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 {
        gpa_t gpa = data & ~0x3f;
 
-       /* Bits 2:5 are resrved, Should be zero */
+       /* Bits 2:5 are reserved, Should be zero */
        if (data & 0x3c)
                return 1;
 
@@ -1522,16 +1584,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        case MSR_K7_HWCR:
                data &= ~(u64)0x40;     /* ignore flush filter disable */
                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
+               data &= ~(u64)0x8;      /* ignore TLB cache disable */
                if (data != 0) {
-                       pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
-                               data);
+                       vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+                                   data);
                        return 1;
                }
                break;
        case MSR_FAM10H_MMIO_CONF_BASE:
                if (data != 0) {
-                       pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
-                               "0x%llx\n", data);
+                       vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+                                   "0x%llx\n", data);
                        return 1;
                }
                break;
@@ -1546,8 +1609,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                           thus reserved and should throw a #GP */
                        return 1;
                }
-               pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
-                       __func__, data);
+               vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+                           __func__, data);
                break;
        case MSR_IA32_UCODE_REV:
        case MSR_IA32_UCODE_WRITE:
@@ -1589,10 +1652,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                vcpu->arch.time_page =
                                gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
 
-               if (is_error_page(vcpu->arch.time_page)) {
-                       kvm_release_page_clean(vcpu->arch.time_page);
+               if (is_error_page(vcpu->arch.time_page))
                        vcpu->arch.time_page = NULL;
-               }
+
                break;
        }
        case MSR_KVM_ASYNC_PF_EN:
@@ -1625,6 +1687,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 
                break;
+       case MSR_KVM_PV_EOI_EN:
+               if (kvm_lapic_enable_pv_eoi(vcpu, data))
+                       return 1;
+               break;
 
        case MSR_IA32_MCG_CTL:
        case MSR_IA32_MCG_STATUS:
@@ -1643,8 +1709,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        case MSR_K7_EVNTSEL2:
        case MSR_K7_EVNTSEL3:
                if (data != 0)
-                       pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
-                               "0x%x data 0x%llx\n", msr, data);
+                       vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+                                   "0x%x data 0x%llx\n", msr, data);
                break;
        /* at least RHEL 4 unconditionally writes to the perfctr registers,
         * so we ignore writes to make it happy.
@@ -1653,8 +1719,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        case MSR_K7_PERFCTR1:
        case MSR_K7_PERFCTR2:
        case MSR_K7_PERFCTR3:
-               pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
-                       "0x%x data 0x%llx\n", msr, data);
+               vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+                           "0x%x data 0x%llx\n", msr, data);
                break;
        case MSR_P6_PERFCTR0:
        case MSR_P6_PERFCTR1:
@@ -1665,15 +1731,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                        return kvm_pmu_set_msr(vcpu, msr, data);
 
                if (pr || data != 0)
-                       pr_unimpl(vcpu, "disabled perfctr wrmsr: "
-                               "0x%x data 0x%llx\n", msr, data);
+                       vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
+                                   "0x%x data 0x%llx\n", msr, data);
                break;
        case MSR_K7_CLK_CTL:
                /*
                 * Ignore all writes to this no longer documented MSR.
                 * Writes are only relevant for old K7 processors,
                 * all pre-dating SVM, but a recommended workaround from
-                * AMD for these chips. It is possible to speicify the
+                * AMD for these chips. It is possible to specify the
                 * affected processor models on the command line, hence
                 * the need to ignore the workaround.
                 */
@@ -1692,7 +1758,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                /* Drop writes to this legacy MSR -- see rdmsr
                 * counterpart for further detail.
                 */
-               pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
+               vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
                break;
        case MSR_AMD64_OSVW_ID_LENGTH:
                if (!guest_cpuid_has_osvw(vcpu))
@@ -1710,12 +1776,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                if (kvm_pmu_msr(vcpu, msr))
                        return kvm_pmu_set_msr(vcpu, msr, data);
                if (!ignore_msrs) {
-                       pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
-                               msr, data);
+                       vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
+                                   msr, data);
                        return 1;
                } else {
-                       pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
-                               msr, data);
+                       vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
+                                   msr, data);
                        break;
                }
        }
@@ -1818,7 +1884,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
                data = kvm->arch.hv_hypercall;
                break;
        default:
-               pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+               vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
                return 1;
        }
 
@@ -1849,7 +1915,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
                data = vcpu->arch.hv_vapic;
                break;
        default:
-               pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+               vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
                return 1;
        }
        *pdata = data;
@@ -1946,6 +2012,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_KVM_STEAL_TIME:
                data = vcpu->arch.st.msr_val;
                break;
+       case MSR_KVM_PV_EOI_EN:
+               data = vcpu->arch.pv_eoi.msr_val;
+               break;
        case MSR_IA32_P5_MC_ADDR:
        case MSR_IA32_P5_MC_TYPE:
        case MSR_IA32_MCG_CAP:
@@ -2002,10 +2071,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
                if (kvm_pmu_msr(vcpu, msr))
                        return kvm_pmu_get_msr(vcpu, msr, pdata);
                if (!ignore_msrs) {
-                       pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+                       vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
                        return 1;
                } else {
-                       pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
+                       vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
                        data = 0;
                }
                break;
@@ -2118,6 +2187,10 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_XSAVE:
        case KVM_CAP_ASYNC_PF:
        case KVM_CAP_GET_TSC_KHZ:
+       case KVM_CAP_PCI_2_3:
+       case KVM_CAP_KVMCLOCK_CTRL:
+       case KVM_CAP_READONLY_MEM:
+       case KVM_CAP_IRQFD_RESAMPLE:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@ -2252,14 +2325,17 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        }
 
        kvm_x86_ops->vcpu_load(vcpu, cpu);
-       if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
-               /* Make sure TSC doesn't go backwards */
-               s64 tsc_delta;
-               u64 tsc;
 
-               tsc = kvm_x86_ops->read_l1_tsc(vcpu);
-               tsc_delta = tsc - vcpu->arch.last_guest_tsc;
+       /* Apply any externally detected TSC adjustments (due to suspend) */
+       if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+               adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+               vcpu->arch.tsc_offset_adjustment = 0;
+               set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+       }
 
+       if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+               s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+                               native_read_tsc() - vcpu->arch.last_host_tsc;
                if (tsc_delta < 0)
                        mark_tsc_unstable("KVM discovered backwards TSC");
                if (check_tsc_unstable()) {
@@ -2282,7 +2358,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
        kvm_x86_ops->vcpu_put(vcpu);
        kvm_put_guest_fpu(vcpu);
-       vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+       vcpu->arch.last_host_tsc = native_read_tsc();
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -2296,8 +2372,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
 {
-       memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
-       kvm_apic_post_state_restore(vcpu);
+       kvm_apic_post_state_restore(vcpu, s);
        update_cr8_intercept(vcpu);
 
        return 0;
@@ -2306,7 +2381,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
                                    struct kvm_interrupt *irq)
 {
-       if (irq->irq < 0 || irq->irq >= 256)
+       if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
                return -EINVAL;
        if (irqchip_in_kernel(vcpu->kvm))
                return -ENXIO;
@@ -2565,6 +2640,21 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
        return r;
 }
 
+/*
+ * kvm_set_guest_paused() indicates to the guest kernel that it has been
+ * stopped by the hypervisor.  This function will be called from the host only.
+ * EINVAL is returned when the host attempts to set the flag for a guest that
+ * does not support pv clocks.
+ */
+static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
+{
+       if (!vcpu->arch.time_page)
+               return -EINVAL;
+       vcpu->arch.pvclock_set_guest_stopped_request = true;
+       kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+       return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
 {
@@ -2599,19 +2689,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                break;
        }
        case KVM_SET_LAPIC: {
-               r = -EINVAL;
                if (!vcpu->arch.apic)
                        goto out;
                u.lapic = memdup_user(argp, sizeof(*u.lapic));
-               if (IS_ERR(u.lapic)) {
-                       r = PTR_ERR(u.lapic);
-                       goto out;
-               }
+               if (IS_ERR(u.lapic))
+                       return PTR_ERR(u.lapic);
 
                r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
-               if (r)
-                       goto out;
-               r = 0;
                break;
        }
        case KVM_INTERRUPT: {
@@ -2621,16 +2705,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                if (copy_from_user(&irq, argp, sizeof irq))
                        goto out;
                r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-               if (r)
-                       goto out;
-               r = 0;
                break;
        }
        case KVM_NMI: {
                r = kvm_vcpu_ioctl_nmi(vcpu);
-               if (r)
-                       goto out;
-               r = 0;
                break;
        }
        case KVM_SET_CPUID: {
@@ -2641,8 +2719,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
                        goto out;
                r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
-               if (r)
-                       goto out;
                break;
        }
        case KVM_SET_CPUID2: {
@@ -2654,8 +2730,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                        goto out;
                r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
                                              cpuid_arg->entries);
-               if (r)
-                       goto out;
                break;
        }
        case KVM_GET_CPUID2: {
@@ -2787,10 +2861,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        }
        case KVM_SET_XSAVE: {
                u.xsave = memdup_user(argp, sizeof(*u.xsave));
-               if (IS_ERR(u.xsave)) {
-                       r = PTR_ERR(u.xsave);
-                       goto out;
-               }
+               if (IS_ERR(u.xsave))
+                       return PTR_ERR(u.xsave);
 
                r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
                break;
@@ -2812,10 +2884,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        }
        case KVM_SET_XCRS: {
                u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
-               if (IS_ERR(u.xcrs)) {
-                       r = PTR_ERR(u.xcrs);
-                       goto out;
-               }
+               if (IS_ERR(u.xcrs))
+                       return PTR_ERR(u.xcrs);
 
                r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
                break;
@@ -2841,6 +2911,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = vcpu->arch.virtual_tsc_khz;
                goto out;
        }
+       case KVM_KVMCLOCK_CTRL: {
+               r = kvm_set_guest_paused(vcpu);
+               goto out;
+       }
        default:
                r = -EINVAL;
        }
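
From the VMM side, the new KVM_KVMCLOCK_CTRL case is meant to be invoked just before resuming a vCPU that was stopped (for example across a checkpoint); a hedged usage sketch, assuming vcpu_fd is an already-open vCPU file descriptor:

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Tell the guest's pvclock that it was deliberately stopped, so guest-side
 * watchdogs do not treat the pause as a soft lockup.  Fails with EINVAL if
 * the guest has not registered a kvmclock page. */
static int mark_guest_paused(int vcpu_fd)
{
	if (ioctl(vcpu_fd, KVM_KVMCLOCK_CTRL, 0) < 0) {
		perror("KVM_KVMCLOCK_CTRL");
		return -errno;
	}
	return 0;
}
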
@@ -2859,7 +2933,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
        int ret;
 
        if (addr > (unsigned int)(-3 * PAGE_SIZE))
-               return -1;
+               return -EINVAL;
        ret = kvm_x86_ops->set_tss_addr(kvm, addr);
        return ret;
 }
@@ -3007,64 +3081,38 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
        if (!kvm->arch.vpit)
                return -ENXIO;
        mutex_lock(&kvm->arch.vpit->pit_state.lock);
-       kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+       kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
        return 0;
 }
 
 /**
- * write_protect_slot - write protect a slot for dirty logging
- * @kvm: the kvm instance
- * @memslot: the slot we protect
- * @dirty_bitmap: the bitmap indicating which pages are dirty
- * @nr_dirty_pages: the number of dirty pages
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
  *
- * We have two ways to find all sptes to protect:
- * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and
- *    checks ones that have a spte mapping a page in the slot.
- * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
+ * We need to keep it in mind that VCPU threads can write to the bitmap
+ * concurrently.  So, to avoid losing data, we keep the following order for
+ * each bit:
  *
- * Generally speaking, if there are not so many dirty pages compared to the
- * number of shadow pages, we should use the latter.
+ *   1. Take a snapshot of the bit and clear it if needed.
+ *   2. Write protect the corresponding page.
+ *   3. Flush TLB's if needed.
+ *   4. Copy the snapshot to the userspace.
  *
- * Note that letting others write into a page marked dirty in the old bitmap
- * by using the remaining tlb entry is not a problem.  That page will become
- * write protected again when we flush the tlb and then be reported dirty to
- * the user space by copying the old bitmap.
+ * Between 2 and 3, the guest may write to the page using the remaining TLB
+ * entry.  This is not a problem because the page will be reported dirty at
+ * step 4 using the snapshot taken before and step 3 ensures that successive
+ * writes will be logged for the next call.
  */
-static void write_protect_slot(struct kvm *kvm,
-                              struct kvm_memory_slot *memslot,
-                              unsigned long *dirty_bitmap,
-                              unsigned long nr_dirty_pages)
-{
-       /* Not many dirty pages compared to # of shadow pages. */
-       if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
-               unsigned long gfn_offset;
-
-               for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
-                       unsigned long gfn = memslot->base_gfn + gfn_offset;
-
-                       spin_lock(&kvm->mmu_lock);
-                       kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
-                       spin_unlock(&kvm->mmu_lock);
-               }
-               kvm_flush_remote_tlbs(kvm);
-       } else {
-               spin_lock(&kvm->mmu_lock);
-               kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-               spin_unlock(&kvm->mmu_lock);
-       }
-}
-
-/*
- * Get (and clear) the dirty memory log for a memory slot.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
-                                     struct kvm_dirty_log *log)
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
        int r;
        struct kvm_memory_slot *memslot;
-       unsigned long n, nr_dirty_pages;
+       unsigned long n, i;
+       unsigned long *dirty_bitmap;
+       unsigned long *dirty_bitmap_buffer;
+       bool is_dirty = false;
 
        mutex_lock(&kvm->slots_lock);
 
@@ -3073,49 +3121,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
                goto out;
 
        memslot = id_to_memslot(kvm->memslots, log->slot);
+
+       dirty_bitmap = memslot->dirty_bitmap;
        r = -ENOENT;
-       if (!memslot->dirty_bitmap)
+       if (!dirty_bitmap)
                goto out;
 
        n = kvm_dirty_bitmap_bytes(memslot);
-       nr_dirty_pages = memslot->nr_dirty_pages;
 
-       /* If nothing is dirty, don't bother messing with page tables. */
-       if (nr_dirty_pages) {
-               struct kvm_memslots *slots, *old_slots;
-               unsigned long *dirty_bitmap, *dirty_bitmap_head;
+       dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
+       memset(dirty_bitmap_buffer, 0, n);
 
-               dirty_bitmap = memslot->dirty_bitmap;
-               dirty_bitmap_head = memslot->dirty_bitmap_head;
-               if (dirty_bitmap == dirty_bitmap_head)
-                       dirty_bitmap_head += n / sizeof(long);
-               memset(dirty_bitmap_head, 0, n);
+       spin_lock(&kvm->mmu_lock);
 
-               r = -ENOMEM;
-               slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL);
-               if (!slots)
-                       goto out;
+       for (i = 0; i < n / sizeof(long); i++) {
+               unsigned long mask;
+               gfn_t offset;
 
-               memslot = id_to_memslot(slots, log->slot);
-               memslot->nr_dirty_pages = 0;
-               memslot->dirty_bitmap = dirty_bitmap_head;
-               update_memslots(slots, NULL);
+               if (!dirty_bitmap[i])
+                       continue;
 
-               old_slots = kvm->memslots;
-               rcu_assign_pointer(kvm->memslots, slots);
-               synchronize_srcu_expedited(&kvm->srcu);
-               kfree(old_slots);
+               is_dirty = true;
 
-               write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages);
+               mask = xchg(&dirty_bitmap[i], 0);
+               dirty_bitmap_buffer[i] = mask;
 
-               r = -EFAULT;
-               if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
-                       goto out;
-       } else {
-               r = -EFAULT;
-               if (clear_user(log->dirty_bitmap, n))
-                       goto out;
+               offset = i * BITS_PER_LONG;
+               kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
        }
+       if (is_dirty)
+               kvm_flush_remote_tlbs(kvm);
+
+       spin_unlock(&kvm->mmu_lock);
+
+       r = -EFAULT;
+       if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+               goto out;
 
        r = 0;
 out:
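
The core of the rewritten dirty-log path is the word-at-a-time snapshot: each live bitmap word is atomically swapped with zero, so vCPUs dirtying pages concurrently can never race with the copy to userspace. A stand-alone sketch of that loop (plain C with GCC atomics; the write-protect call is replaced by a stub, names are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Stub standing in for kvm_mmu_write_protect_pt_masked(): the kernel
 * write-protects every page whose bit is set in 'mask', starting at
 * gfn offset 'offset' within the slot. */
static void write_protect_masked(unsigned long offset, unsigned long mask)
{
	printf("write-protect gfns at offset %lu, mask %#lx\n", offset, mask);
}

/* Snapshot-and-clear the live bitmap into 'buffer', write-protecting as
 * we go; returns true if anything was dirty so the caller flushes TLBs. */
static bool snapshot_dirty_log(unsigned long *live, unsigned long *buffer,
			       unsigned long nwords)
{
	bool is_dirty = false;
	unsigned long i;

	for (i = 0; i < nwords; i++) {
		unsigned long mask;

		if (!live[i])
			continue;

		is_dirty = true;
		mask = __atomic_exchange_n(&live[i], 0, __ATOMIC_RELAXED);
		buffer[i] = mask;
		write_protect_masked(i * BITS_PER_LONG, mask);
	}
	return is_dirty;
}

int main(void)
{
	unsigned long live[2] = { 0x5, 0x0 };
	unsigned long buf[2] = { 0, 0 };

	if (snapshot_dirty_log(live, buf, 2))
		printf("dirty: buf[0]=%#lx, live[0] now %#lx\n", buf[0], live[0]);
	return 0;
}
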
@@ -3123,6 +3164,16 @@ out:
        return r;
 }
 
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+{
+       if (!irqchip_in_kernel(kvm))
+               return -ENXIO;
+
+       irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+                                       irq_event->irq, irq_event->level);
+       return 0;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
 {
@@ -3143,8 +3194,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
        switch (ioctl) {
        case KVM_SET_TSS_ADDR:
                r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
-               if (r < 0)
-                       goto out;
                break;
        case KVM_SET_IDENTITY_MAP_ADDR: {
                u64 ident_addr;
@@ -3153,14 +3202,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
                        goto out;
                r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
-               if (r < 0)
-                       goto out;
                break;
        }
        case KVM_SET_NR_MMU_PAGES:
                r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
-               if (r)
-                       goto out;
                break;
        case KVM_GET_NR_MMU_PAGES:
                r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
@@ -3172,6 +3217,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = -EEXIST;
                if (kvm->arch.vpic)
                        goto create_irqchip_unlock;
+               r = -EINVAL;
+               if (atomic_read(&kvm->online_vcpus))
+                       goto create_irqchip_unlock;
                r = -ENOMEM;
                vpic = kvm_create_pic(kvm);
                if (vpic) {
@@ -3226,29 +3274,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
        create_pit_unlock:
                mutex_unlock(&kvm->slots_lock);
                break;
-       case KVM_IRQ_LINE_STATUS:
-       case KVM_IRQ_LINE: {
-               struct kvm_irq_level irq_event;
-
-               r = -EFAULT;
-               if (copy_from_user(&irq_event, argp, sizeof irq_event))
-                       goto out;
-               r = -ENXIO;
-               if (irqchip_in_kernel(kvm)) {
-                       __s32 status;
-                       status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
-                                       irq_event.irq, irq_event.level);
-                       if (ioctl == KVM_IRQ_LINE_STATUS) {
-                               r = -EFAULT;
-                               irq_event.status = status;
-                               if (copy_to_user(argp, &irq_event,
-                                                       sizeof irq_event))
-                                       goto out;
-                       }
-                       r = 0;
-               }
-               break;
-       }
        case KVM_GET_IRQCHIP: {
                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
                struct kvm_irqchip *chip;
@@ -3271,8 +3296,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = 0;
        get_irqchip_out:
                kfree(chip);
-               if (r)
-                       goto out;
                break;
        }
        case KVM_SET_IRQCHIP: {
@@ -3294,8 +3317,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = 0;
        set_irqchip_out:
                kfree(chip);
-               if (r)
-                       goto out;
                break;
        }
        case KVM_GET_PIT: {
@@ -3322,9 +3343,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (!kvm->arch.vpit)
                        goto out;
                r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
-               if (r)
-                       goto out;
-               r = 0;
                break;
        }
        case KVM_GET_PIT2: {
@@ -3348,9 +3366,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (!kvm->arch.vpit)
                        goto out;
                r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
-               if (r)
-                       goto out;
-               r = 0;
                break;
        }
        case KVM_REINJECT_CONTROL: {
@@ -3359,9 +3374,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (copy_from_user(&control, argp, sizeof(control)))
                        goto out;
                r = kvm_vm_ioctl_reinject(kvm, &control);
-               if (r)
-                       goto out;
-               r = 0;
                break;
        }
        case KVM_XEN_HVM_CONFIG: {
@@ -3636,20 +3648,17 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
                                gpa_t *gpa, struct x86_exception *exception,
                                bool write)
 {
-       u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+       u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+               | (write ? PFERR_WRITE_MASK : 0);
 
-       if (vcpu_match_mmio_gva(vcpu, gva) &&
-                 check_write_user_access(vcpu, write, access,
-                 vcpu->arch.access)) {
+       if (vcpu_match_mmio_gva(vcpu, gva)
+           && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
                *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
                                        (gva & (PAGE_SIZE - 1));
                trace_vcpu_match_mmio(gva, *gpa, write, false);
                return 1;
        }
 
-       if (write)
-               access |= PFERR_WRITE_MASK;
-
        *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 
        if (*gpa == UNMAPPED_GVA)
@@ -3694,9 +3703,8 @@ struct read_write_emulator_ops {
 static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
 {
        if (vcpu->mmio_read_completed) {
-               memcpy(val, vcpu->mmio_data, bytes);
                trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
-                              vcpu->mmio_phys_addr, *(u64 *)val);
+                              vcpu->mmio_fragments[0].gpa, *(u64 *)val);
                vcpu->mmio_read_completed = 0;
                return 1;
        }
@@ -3732,19 +3740,20 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
 static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
                           void *val, int bytes)
 {
-       memcpy(vcpu->mmio_data, val, bytes);
-       memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+       struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
+
+       memcpy(vcpu->run->mmio.data, frag->data, frag->len);
        return X86EMUL_CONTINUE;
 }
 
-static struct read_write_emulator_ops read_emultor = {
+static const struct read_write_emulator_ops read_emultor = {
        .read_write_prepare = read_prepare,
        .read_write_emulate = read_emulate,
        .read_write_mmio = vcpu_mmio_read,
        .read_write_exit_mmio = read_exit_mmio,
 };
 
-static struct read_write_emulator_ops write_emultor = {
+static const struct read_write_emulator_ops write_emultor = {
        .read_write_emulate = write_emulate,
        .read_write_mmio = write_mmio,
        .read_write_exit_mmio = write_exit_mmio,
@@ -3755,15 +3764,12 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
                                       unsigned int bytes,
                                       struct x86_exception *exception,
                                       struct kvm_vcpu *vcpu,
-                                      struct read_write_emulator_ops *ops)
+                                      const struct read_write_emulator_ops *ops)
 {
        gpa_t gpa;
        int handled, ret;
        bool write = ops->write;
-
-       if (ops->read_write_prepare &&
-                 ops->read_write_prepare(vcpu, val, bytes))
-               return X86EMUL_CONTINUE;
+       struct kvm_mmio_fragment *frag;
 
        ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
 
@@ -3789,27 +3795,39 @@ mmio:
        bytes -= handled;
        val += handled;
 
-       vcpu->mmio_needed = 1;
-       vcpu->run->exit_reason = KVM_EXIT_MMIO;
-       vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
-       vcpu->mmio_size = bytes;
-       vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
-       vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
-       vcpu->mmio_index = 0;
+       while (bytes) {
+               unsigned now = min(bytes, 8U);
 
-       return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
+               frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
+               frag->gpa = gpa;
+               frag->data = val;
+               frag->len = now;
+
+               gpa += now;
+               val += now;
+               bytes -= now;
+       }
+       return X86EMUL_CONTINUE;
 }
 
 int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
                        void *val, unsigned int bytes,
                        struct x86_exception *exception,
-                       struct read_write_emulator_ops *ops)
+                       const struct read_write_emulator_ops *ops)
 {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+       gpa_t gpa;
+       int rc;
+
+       if (ops->read_write_prepare &&
+                 ops->read_write_prepare(vcpu, val, bytes))
+               return X86EMUL_CONTINUE;
+
+       vcpu->mmio_nr_fragments = 0;
 
        /* Crossing a page boundary? */
        if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
-               int rc, now;
+               int now;
 
                now = -addr & ~PAGE_MASK;
                rc = emulator_read_write_onepage(addr, val, now, exception,
@@ -3822,8 +3840,25 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
                bytes -= now;
        }
 
-       return emulator_read_write_onepage(addr, val, bytes, exception,
-                                          vcpu, ops);
+       rc = emulator_read_write_onepage(addr, val, bytes, exception,
+                                        vcpu, ops);
+       if (rc != X86EMUL_CONTINUE)
+               return rc;
+
+       if (!vcpu->mmio_nr_fragments)
+               return rc;
+
+       gpa = vcpu->mmio_fragments[0].gpa;
+
+       vcpu->mmio_needed = 1;
+       vcpu->mmio_cur_fragment = 0;
+
+       vcpu->run->mmio.len = vcpu->mmio_fragments[0].len;
+       vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
+       vcpu->run->exit_reason = KVM_EXIT_MMIO;
+       vcpu->run->mmio.phys_addr = gpa;
+
+       return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
 }
 
 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
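
Splitting an unhandled access into at most 8-byte pieces, as emulator_read_write_onepage now does, is plain bookkeeping; a small sketch of the same loop (the fragment struct and the 8-byte limit are written out here for illustration, not taken verbatim from the kernel headers):

#include <stdint.h>
#include <stdio.h>

struct mmio_fragment {		/* illustrative stand-in for kvm_mmio_fragment */
	uint64_t gpa;
	void *data;
	unsigned len;
};

/* Break [gpa, gpa + bytes) into fragments of at most 8 bytes each, the
 * largest chunk a single KVM_EXIT_MMIO round trip can carry. */
static unsigned split_mmio(uint64_t gpa, void *val, unsigned bytes,
			   struct mmio_fragment *frags)
{
	unsigned n = 0;

	while (bytes) {
		unsigned now = bytes < 8 ? bytes : 8;

		frags[n].gpa = gpa;
		frags[n].data = val;
		frags[n].len = now;
		n++;

		gpa += now;
		val = (uint8_t *)val + now;
		bytes -= now;
	}
	return n;
}

int main(void)
{
	uint8_t buf[20];
	struct mmio_fragment frags[4];
	unsigned n = split_mmio(0x1000, buf, sizeof(buf), frags);

	printf("%u fragments, last len %u\n", n, frags[n - 1].len);	/* 3, 4 */
	return 0;
}
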
@@ -3883,12 +3918,10 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
                goto emul_write;
 
        page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-       if (is_error_page(page)) {
-               kvm_release_page_clean(page);
+       if (is_error_page(page))
                goto emul_write;
-       }
 
-       kaddr = kmap_atomic(page, KM_USER0);
+       kaddr = kmap_atomic(page);
        kaddr += offset_in_page(gpa);
        switch (bytes) {
        case 1:
@@ -3906,7 +3939,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
        default:
                BUG();
        }
-       kunmap_atomic(kaddr, KM_USER0);
+       kunmap_atomic(kaddr);
        kvm_release_page_dirty(page);
 
        if (!exchanged)
@@ -4066,7 +4099,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
                value = kvm_get_cr8(vcpu);
                break;
        default:
-               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+               kvm_err("%s: unexpected cr %u\n", __func__, cr);
                return 0;
        }
 
@@ -4095,13 +4128,18 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
                res = kvm_set_cr8(vcpu, val);
                break;
        default:
-               vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+               kvm_err("%s: unexpected cr %u\n", __func__, cr);
                res = -1;
        }
 
        return res;
 }
 
+static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
+{
+       kvm_set_rflags(emul_to_vcpu(ctxt), val);
+}
+
 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
 {
        return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4242,29 +4280,25 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
        return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
 }
 
-static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
                               u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 {
-       struct kvm_cpuid_entry2 *cpuid = NULL;
-
-       if (eax && ecx)
-               cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt),
-                                           *eax, *ecx);
+       kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
+}
 
-       if (cpuid) {
-               *eax = cpuid->eax;
-               *ecx = cpuid->ecx;
-               if (ebx)
-                       *ebx = cpuid->ebx;
-               if (edx)
-                       *edx = cpuid->edx;
-               return true;
-       }
+static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
+{
+       return kvm_register_read(emul_to_vcpu(ctxt), reg);
+}
 
-       return false;
+static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
+{
+       kvm_register_write(emul_to_vcpu(ctxt), reg, val);
 }
 
-static struct x86_emulate_ops emulate_ops = {
+static const struct x86_emulate_ops emulate_ops = {
+       .read_gpr            = emulator_read_gpr,
+       .write_gpr           = emulator_write_gpr,
        .read_std            = kvm_read_guest_virt_system,
        .write_std           = kvm_write_guest_virt_system,
        .fetch               = kvm_fetch_guest_virt,
@@ -4283,6 +4317,7 @@ static struct x86_emulate_ops emulate_ops = {
        .set_idt             = emulator_set_idt,
        .get_cr              = emulator_get_cr,
        .set_cr              = emulator_set_cr,
+       .set_rflags          = emulator_set_rflags,
        .cpl                 = emulator_get_cpl,
        .get_dr              = emulator_get_dr,
        .set_dr              = emulator_set_dr,
@@ -4298,14 +4333,6 @@ static struct x86_emulate_ops emulate_ops = {
        .get_cpuid           = emulator_get_cpuid,
 };
 
-static void cache_all_regs(struct kvm_vcpu *vcpu)
-{
-       kvm_register_read(vcpu, VCPU_REGS_RAX);
-       kvm_register_read(vcpu, VCPU_REGS_RSP);
-       kvm_register_read(vcpu, VCPU_REGS_RIP);
-       vcpu->arch.regs_dirty = ~0;
-}
-
 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
 {
        u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
@@ -4332,12 +4359,10 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
                kvm_queue_exception(vcpu, ctxt->exception.vector);
 }
 
-static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
-                             const unsigned long *regs)
+static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
 {
        memset(&ctxt->twobyte, 0,
-              (void *)&ctxt->regs - (void *)&ctxt->twobyte);
-       memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
+              (void *)&ctxt->_regs - (void *)&ctxt->twobyte);
 
        ctxt->fetch.start = 0;
        ctxt->fetch.end = 0;
@@ -4352,14 +4377,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
        int cs_db, cs_l;
 
-       /*
-        * TODO: fix emulate.c to use guest_read/write_register
-        * instead of direct ->regs accesses, can save hundred cycles
-        * on Intel for instructions that don't read/change RSP, for
-        * for example.
-        */
-       cache_all_regs(vcpu);
-
        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
        ctxt->eflags = kvm_get_rflags(vcpu);
@@ -4371,7 +4388,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
                                                          X86EMUL_MODE_PROT16;
        ctxt->guest_mode = is_guest_mode(vcpu);
 
-       init_decode_cache(ctxt, vcpu->arch.regs);
+       init_decode_cache(ctxt);
        vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
 }
 
@@ -4391,7 +4408,6 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
                return EMULATE_FAIL;
 
        ctxt->eip = ctxt->_eip;
-       memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
        kvm_rip_write(vcpu, ctxt->eip);
        kvm_set_rflags(vcpu, ctxt->eflags);
 
@@ -4424,13 +4440,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 {
        gpa_t gpa;
+       pfn_t pfn;
 
        if (tdp_enabled)
                return false;
 
        /*
         * if emulation was due to access to shadowed page table
-        * and it failed try to unshadow page and re-entetr the
+        * and it failed try to unshadow page and re-enter the
         * guest to let CPU execute the instruction.
         */
        if (kvm_mmu_unprotect_page_virt(vcpu, gva))
@@ -4441,8 +4458,17 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
        if (gpa == UNMAPPED_GVA)
                return true; /* let cpu generate fault */
 
-       if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+       /*
+        * Do not retry the unhandleable instruction if it faults on the
+        * readonly host memory, otherwise it will go into an infinite loop:
+        * retry instruction -> write #PF -> emulation fail -> retry
+        * instruction -> ...
+        */
+       pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
+       if (!is_error_noslot_pfn(pfn)) {
+               kvm_release_pfn_clean(pfn);
                return true;
+       }
 
        return false;
 }
@@ -4491,6 +4517,9 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
        return true;
 }
 
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
+static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+
 int x86_emulate_instruction(struct kvm_vcpu *vcpu,
                            unsigned long cr2,
                            int emulation_type,
@@ -4539,7 +4568,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
           changes registers values  during IO operation */
        if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
                vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
-               memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
+               emulator_invalidate_register_cache(ctxt);
        }
 
 restart:
@@ -4561,13 +4590,16 @@ restart:
        } else if (vcpu->arch.pio.count) {
                if (!vcpu->arch.pio.in)
                        vcpu->arch.pio.count = 0;
-               else
+               else {
                        writeback = false;
+                       vcpu->arch.complete_userspace_io = complete_emulated_pio;
+               }
                r = EMULATE_DO_MMIO;
        } else if (vcpu->mmio_needed) {
                if (!vcpu->mmio_is_write)
                        writeback = false;
                r = EMULATE_DO_MMIO;
+               vcpu->arch.complete_userspace_io = complete_emulated_mmio;
        } else if (r == EMULATION_RESTART)
                goto restart;
        else
@@ -4577,7 +4609,6 @@ restart:
                toggle_interruptibility(vcpu, ctxt->interruptibility);
                kvm_set_rflags(vcpu, ctxt->eflags);
                kvm_make_request(KVM_REQ_EVENT, vcpu);
-               memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
                kvm_rip_write(vcpu, ctxt->eip);
        } else
@@ -4860,6 +4891,7 @@ int kvm_arch_init(void *opaque)
        if (cpu_has_xsave)
                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 
+       kvm_lapic_init();
        return 0;
 
 out:
@@ -4997,7 +5029,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
 
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        char instruction[3];
@@ -5044,17 +5076,20 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
                        !kvm_event_needs_reinjection(vcpu);
 }
 
-static void vapic_enter(struct kvm_vcpu *vcpu)
+static int vapic_enter(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
        struct page *page;
 
        if (!apic || !apic->vapic_addr)
-               return;
+               return 0;
 
        page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
+       if (is_error_page(page))
+               return -EFAULT;
 
        vcpu->arch.apic->vapic_page = page;
+       return 0;
 }
 
 static void vapic_exit(struct kvm_vcpu *vcpu)
@@ -5223,10 +5258,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        kvm_deliver_pmi(vcpu);
        }
 
-       r = kvm_mmu_reload(vcpu);
-       if (unlikely(r))
-               goto out;
-
        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
                inject_pending_event(vcpu);
 
@@ -5242,6 +5273,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                }
        }
 
+       r = kvm_mmu_reload(vcpu);
+       if (unlikely(r)) {
+               goto cancel_injection;
+       }
+
        preempt_disable();
 
        kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -5264,9 +5300,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                smp_wmb();
                local_irq_enable();
                preempt_enable();
-               kvm_x86_ops->cancel_injection(vcpu);
                r = 1;
-               goto out;
+               goto cancel_injection;
        }
 
        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -5330,9 +5365,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (unlikely(vcpu->arch.tsc_always_catchup))
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
-       kvm_lapic_sync_from_vapic(vcpu);
+       if (vcpu->arch.apic_attention)
+               kvm_lapic_sync_from_vapic(vcpu);
 
        r = kvm_x86_ops->handle_exit(vcpu);
+       return r;
+
+cancel_injection:
+       kvm_x86_ops->cancel_injection(vcpu);
+       if (unlikely(vcpu->arch.apic_attention))
+               kvm_lapic_sync_from_vapic(vcpu);
 out:
        return r;
 }
@@ -5347,14 +5389,18 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                pr_debug("vcpu %d received sipi with vector # %x\n",
                         vcpu->vcpu_id, vcpu->arch.sipi_vector);
                kvm_lapic_reset(vcpu);
-               r = kvm_arch_vcpu_reset(vcpu);
+               r = kvm_vcpu_reset(vcpu);
                if (r)
                        return r;
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        }
 
        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-       vapic_enter(vcpu);
+       r = vapic_enter(vcpu);
+       if (r) {
+               srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+               return r;
+       }
 
        r = 1;
        while (r > 0) {
@@ -5416,33 +5462,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
        return r;
 }
 
-static int complete_mmio(struct kvm_vcpu *vcpu)
+static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
 {
-       struct kvm_run *run = vcpu->run;
        int r;
-
-       if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
-               return 1;
-
-       if (vcpu->mmio_needed) {
-               vcpu->mmio_needed = 0;
-               if (!vcpu->mmio_is_write)
-                       memcpy(vcpu->mmio_data + vcpu->mmio_index,
-                              run->mmio.data, 8);
-               vcpu->mmio_index += 8;
-               if (vcpu->mmio_index < vcpu->mmio_size) {
-                       run->exit_reason = KVM_EXIT_MMIO;
-                       run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index;
-                       memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8);
-                       run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
-                       run->mmio.is_write = vcpu->mmio_is_write;
-                       vcpu->mmio_needed = 1;
-                       return 0;
-               }
-               if (vcpu->mmio_is_write)
-                       return 1;
-               vcpu->mmio_read_completed = 1;
-       }
        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
        r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -5451,6 +5473,60 @@ static int complete_mmio(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int complete_emulated_pio(struct kvm_vcpu *vcpu)
+{
+       BUG_ON(!vcpu->arch.pio.count);
+
+       return complete_emulated_io(vcpu);
+}
+
+/*
+ * Implements the following, as a state machine:
+ *
+ * read:
+ *   for each fragment
+ *     write gpa, len
+ *     exit
+ *     copy data
+ *   execute insn
+ *
+ * write:
+ *   for each fragment
+ *     write gpa, len
+ *     copy data
+ *     exit
+ */
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
+{
+       struct kvm_run *run = vcpu->run;
+       struct kvm_mmio_fragment *frag;
+
+       BUG_ON(!vcpu->mmio_needed);
+
+       /* Complete previous fragment */
+       frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
+       if (!vcpu->mmio_is_write)
+               memcpy(frag->data, run->mmio.data, frag->len);
+       if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
+               vcpu->mmio_needed = 0;
+               if (vcpu->mmio_is_write)
+                       return 1;
+               vcpu->mmio_read_completed = 1;
+               return complete_emulated_io(vcpu);
+       }
+       /* Initiate next fragment */
+       ++frag;
+       run->exit_reason = KVM_EXIT_MMIO;
+       run->mmio.phys_addr = frag->gpa;
+       if (vcpu->mmio_is_write)
+               memcpy(run->mmio.data, frag->data, frag->len);
+       run->mmio.len = frag->len;
+       run->mmio.is_write = vcpu->mmio_is_write;
+       vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+       return 0;
+}
+
+
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        int r;
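
For context, the fragment state machine above has a userspace counterpart: the VMM services one KVM_EXIT_MMIO per fragment and re-enters the guest, at which point the registered complete_userspace_io callback advances the machine. A minimal, hypothetical VMM-side sketch (not part of this patch; device_mmio_read() and device_mmio_write() stand in for a real device model):

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

/* Hypothetical device-model hooks. */
extern void device_mmio_read(uint64_t gpa, void *data, uint32_t len);
extern void device_mmio_write(uint64_t gpa, const void *data, uint32_t len);

static int run_vcpu(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			return -1;

		if (run->exit_reason != KVM_EXIT_MMIO)
			return 0;	/* other exit reasons handled by the caller */

		/* One fragment per exit; KVM_RUN resumes the state machine. */
		if (run->mmio.is_write)
			device_mmio_write(run->mmio.phys_addr,
					  run->mmio.data, run->mmio.len);
		else
			device_mmio_read(run->mmio.phys_addr,
					 run->mmio.data, run->mmio.len);
	}
}

Each KVM_RUN either finishes the emulated instruction or publishes the next fragment in run->mmio, so the userspace loop never needs to know how many fragments an access was split into.
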
@@ -5477,9 +5553,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                }
        }
 
-       r = complete_mmio(vcpu);
-       if (r <= 0)
-               goto out;
+       if (unlikely(vcpu->arch.complete_userspace_io)) {
+               int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
+               vcpu->arch.complete_userspace_io = NULL;
+               r = cui(vcpu);
+               if (r <= 0)
+                       goto out;
+       } else
+               WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
        r = __vcpu_run(vcpu);
 
@@ -5497,12 +5578,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
                /*
                 * We are here if userspace calls get_regs() in the middle of
                 * instruction emulation. Registers state needs to be copied
-                * back from emulation context to vcpu. Usrapace shouldn't do
+                * back from emulation context to vcpu. Userspace shouldn't do
                 * that usually, but some badly designed PV devices (vmware
                 * backdoor interface) need this to work
                 */
-               struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
-               memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+               emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
        }
        regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -5628,21 +5708,20 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
-                   bool has_error_code, u32 error_code)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+                   int reason, bool has_error_code, u32 error_code)
 {
        struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
        int ret;
 
        init_emulate_ctxt(vcpu);
 
-       ret = emulator_task_switch(ctxt, tss_selector, reason,
+       ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
                                   has_error_code, error_code);
 
        if (ret)
                return EMULATE_FAIL;
 
-       memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
        kvm_rip_write(vcpu, ctxt->eip);
        kvm_set_rflags(vcpu, ctxt->eflags);
        kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5694,7 +5773,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        if (mmu_reset_needed)
                kvm_mmu_reset_context(vcpu);
 
-       max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+       max_bits = KVM_NR_INTERRUPTS;
        pending_vec = find_first_bit(
                (const unsigned long *)sregs->interrupt_bitmap, max_bits);
        if (pending_vec < max_bits) {
@@ -5754,13 +5833,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
                for (i = 0; i < KVM_NR_DB_REGS; ++i)
                        vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
-               vcpu->arch.switch_db_regs =
-                       (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
+               vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
        } else {
                for (i = 0; i < KVM_NR_DB_REGS; i++)
                        vcpu->arch.eff_db[i] = vcpu->arch.db[i];
-               vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
        }
+       kvm_update_dr7(vcpu);
 
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
@@ -5772,7 +5850,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
         */
        kvm_set_rflags(vcpu, rflags);
 
-       kvm_x86_ops->set_guest_debug(vcpu, dbg);
+       kvm_x86_ops->update_db_bp_intercept(vcpu);
 
        r = 0;
 
@@ -5874,7 +5952,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
         */
        kvm_put_guest_xcr0(vcpu);
        vcpu->guest_fpu_loaded = 1;
-       unlazy_fpu(current);
+       __kernel_fpu_begin();
        fpu_restore_checking(&vcpu->arch.guest_fpu);
        trace_kvm_fpu(1);
 }
@@ -5888,6 +5966,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 
        vcpu->guest_fpu_loaded = 0;
        fpu_save_init(&vcpu->arch.guest_fpu);
+       __kernel_fpu_end();
        ++vcpu->stat.fpu_reload;
        kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
        trace_kvm_fpu(0);
@@ -5917,8 +5996,10 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        int r;
 
        vcpu->arch.mtrr_state.have_fixed = 1;
-       vcpu_load(vcpu);
-       r = kvm_arch_vcpu_reset(vcpu);
+       r = vcpu_load(vcpu);
+       if (r)
+               return r;
+       r = kvm_vcpu_reset(vcpu);
        if (r == 0)
                r = kvm_mmu_setup(vcpu);
        vcpu_put(vcpu);
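
The new return-value handling around vcpu_load() mirrors a companion change on the generic side, where the vcpu mutex is taken killably so a fatal signal no longer leaves a task stuck in uninterruptible sleep. A simplified sketch of that generic helper, under the assumption that the companion patch is applied:

int vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	/* Fail instead of sleeping uninterruptibly if the task was killed. */
	if (mutex_lock_killable(&vcpu->mutex))
		return -EINTR;
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
	return 0;
}

Paths that cannot reasonably fail, such as kvm_arch_vcpu_destroy() below, assert success with BUG_ON(r) rather than propagating the error.
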
@@ -5928,9 +6009,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
+       int r;
        vcpu->arch.apf.msr_val = 0;
 
-       vcpu_load(vcpu);
+       r = vcpu_load(vcpu);
+       BUG_ON(r);
        kvm_mmu_unload(vcpu);
        vcpu_put(vcpu);
 
@@ -5938,16 +6021,16 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        kvm_x86_ops->vcpu_free(vcpu);
 }
 
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
        atomic_set(&vcpu->arch.nmi_queued, 0);
        vcpu->arch.nmi_pending = 0;
        vcpu->arch.nmi_injected = false;
 
-       vcpu->arch.switch_db_regs = 0;
        memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
        vcpu->arch.dr6 = DR6_FIXED_1;
        vcpu->arch.dr7 = DR7_FIXED_1;
+       kvm_update_dr7(vcpu);
 
        kvm_make_request(KVM_REQ_EVENT, vcpu);
        vcpu->arch.apf.msr_val = 0;
@@ -5969,13 +6052,88 @@ int kvm_arch_hardware_enable(void *garbage)
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
        int i;
+       int ret;
+       u64 local_tsc;
+       u64 max_tsc = 0;
+       bool stable, backwards_tsc = false;
 
        kvm_shared_msr_cpu_online();
-       list_for_each_entry(kvm, &vm_list, vm_list)
-               kvm_for_each_vcpu(i, vcpu, kvm)
-                       if (vcpu->cpu == smp_processor_id())
-                               kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-       return kvm_x86_ops->hardware_enable(garbage);
+       ret = kvm_x86_ops->hardware_enable(garbage);
+       if (ret != 0)
+               return ret;
+
+       local_tsc = native_read_tsc();
+       stable = !check_tsc_unstable();
+       list_for_each_entry(kvm, &vm_list, vm_list) {
+               kvm_for_each_vcpu(i, vcpu, kvm) {
+                       if (!stable && vcpu->cpu == smp_processor_id())
+                               set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+                       if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+                               backwards_tsc = true;
+                               if (vcpu->arch.last_host_tsc > max_tsc)
+                                       max_tsc = vcpu->arch.last_host_tsc;
+                       }
+               }
+       }
+
+       /*
+        * Sometimes, even reliable TSCs go backwards.  This happens on
+        * platforms that reset TSC during suspend or hibernate actions, but
+        * maintain synchronization.  We must compensate.  Fortunately, we can
+        * detect that condition here, which happens early in CPU bringup,
+        * before any KVM threads can be running.  Unfortunately, we can't
+        * bring the TSCs fully up to date with real time, as we aren't yet far
+        * enough into CPU bringup that we know how much real time has actually
+        * elapsed; our helper function, get_kernel_ns() will be using boot
+        * variables that haven't been updated yet.
+        *
+        * So we simply find the maximum observed TSC above, then record the
+        * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
+        * the adjustment will be applied.  Note that we accumulate
+        * adjustments, in case multiple suspend cycles happen before some VCPU
+        * gets a chance to run again.  In the event that no KVM threads get a
+        * chance to run, we will miss the entire elapsed period, as we'll have
+        * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+        * lose cycle time.  This isn't too big a deal, since the loss will be
+        * uniform across all VCPUs (not to mention the scenario is extremely
+        * unlikely). It is possible that a second hibernate recovery happens
+        * much faster than a first, causing the observed TSC here to be
+        * smaller; this would require additional padding adjustment, which is
+        * why we set last_host_tsc to the local tsc observed here.
+        *
+        * N.B. - this code below runs only on platforms with reliable TSC,
+        * as that is the only way backwards_tsc is set above.  Also note
+        * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+        * have the same delta_cyc adjustment applied if backwards_tsc
+        * is detected.  Note further, this adjustment is only done once,
+        * as we reset last_host_tsc on all VCPUs to stop this from being
+        * called multiple times (one for each physical CPU bringup).
+        *
+        * Platforms with unreliable TSCs don't have to deal with this, they
+        * will be compensated by the logic in vcpu_load, which sets the TSC to
+        * catchup mode.  This will catchup all VCPUs to real time, but cannot
+        * guarantee that they stay in perfect synchronization.
+        */
+       if (backwards_tsc) {
+               u64 delta_cyc = max_tsc - local_tsc;
+               list_for_each_entry(kvm, &vm_list, vm_list) {
+                       kvm_for_each_vcpu(i, vcpu, kvm) {
+                               vcpu->arch.tsc_offset_adjustment += delta_cyc;
+                               vcpu->arch.last_host_tsc = local_tsc;
+                       }
+
+                       /*
+                        * We have to disable TSC offset matching: if you were
+                        * booting a VM while issuing an S4 host suspend, you
+                        * may hit problems.  Solving this issue is left as an
+                        * exercise to the reader.
+                        */
+                       kvm->arch.last_tsc_nsec = 0;
+                       kvm->arch.last_tsc_write = 0;
+               }
+
+       }
+       return 0;
 }
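
A hypothetical numeric illustration of the compensation above: assume every VCPU last ran when the host TSC read 10,000,000,000 cycles, and an S4 suspend reset the TSC so that local_tsc reads 1,000 here. Then:

	max_tsc   = 10,000,000,000                       (largest last_host_tsc seen)
	delta_cyc = 10,000,000,000 - 1,000 = 9,999,999,000 cycles

	every VCPU: tsc_offset_adjustment += 9,999,999,000
	            last_host_tsc          = 1,000

The adjustment is applied the next time each VCPU is loaded, keeping the guest-visible TSC monotonic, and a second suspend before any VCPU runs simply accumulates a further delta on top.
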
 
 void kvm_arch_hardware_disable(void *garbage)
@@ -5999,6 +6157,13 @@ void kvm_arch_check_processor_compat(void *rtn)
        kvm_x86_ops->check_processor_compatibility(rtn);
 }
 
+bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
+{
+       return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+}
+
+struct static_key kvm_no_apic_vcpu __read_mostly;
+
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
        struct page *page;
@@ -6031,7 +6196,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
                r = kvm_create_lapic(vcpu);
                if (r < 0)
                        goto fail_mmu_destroy;
-       }
+       } else
+               static_key_slow_inc(&kvm_no_apic_vcpu);
 
        vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
                                       GFP_KERNEL);
@@ -6071,6 +6237,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
        kvm_mmu_destroy(vcpu);
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
        free_page((unsigned long)vcpu->arch.pio_data);
+       if (!irqchip_in_kernel(vcpu->kvm))
+               static_key_slow_dec(&kvm_no_apic_vcpu);
 }
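
kvm_no_apic_vcpu is a jump label: while every VCPU has an in-kernel LAPIC (the common case) the branch is patched out entirely. A consumer on the local APIC side might look roughly like the sketch below; the helper name and placement are illustrative, not part of this hunk:

static inline bool vcpu_has_lapic(struct kvm_vcpu *vcpu)
{
	/* Compiled to a patched-out branch while no APIC-less VCPU exists. */
	if (static_key_false(&kvm_no_apic_vcpu))
		return vcpu->arch.apic != NULL;
	return true;
}
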
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
@@ -6083,15 +6251,21 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
        set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+       /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
+       set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+               &kvm->arch.irq_sources_bitmap);
 
        raw_spin_lock_init(&kvm->arch.tsc_write_lock);
+       mutex_init(&kvm->arch.apic_map_lock);
 
        return 0;
 }
 
 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 {
-       vcpu_load(vcpu);
+       int r;
+       r = vcpu_load(vcpu);
+       BUG_ON(r);
        kvm_mmu_unload(vcpu);
        vcpu_put(vcpu);
 }
@@ -6135,6 +6309,86 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
                put_page(kvm->arch.apic_access_page);
        if (kvm->arch.ept_identity_pagetable)
                put_page(kvm->arch.ept_identity_pagetable);
+       kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+}
+
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+                          struct kvm_memory_slot *dont)
+{
+       int i;
+
+       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
+                       kvm_kvfree(free->arch.rmap[i]);
+                       free->arch.rmap[i] = NULL;
+               }
+               if (i == 0)
+                       continue;
+
+               if (!dont || free->arch.lpage_info[i - 1] !=
+                            dont->arch.lpage_info[i - 1]) {
+                       kvm_kvfree(free->arch.lpage_info[i - 1]);
+                       free->arch.lpage_info[i - 1] = NULL;
+               }
+       }
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+       int i;
+
+       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               unsigned long ugfn;
+               int lpages;
+               int level = i + 1;
+
+               lpages = gfn_to_index(slot->base_gfn + npages - 1,
+                                     slot->base_gfn, level) + 1;
+
+               slot->arch.rmap[i] =
+                       kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
+               if (!slot->arch.rmap[i])
+                       goto out_free;
+               if (i == 0)
+                       continue;
+
+               slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
+                                       sizeof(*slot->arch.lpage_info[i - 1]));
+               if (!slot->arch.lpage_info[i - 1])
+                       goto out_free;
+
+               if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+                       slot->arch.lpage_info[i - 1][0].write_count = 1;
+               if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+                       slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
+               ugfn = slot->userspace_addr >> PAGE_SHIFT;
+               /*
+                * If the gfn and userspace address are not aligned wrt each
+                * other, or if explicitly asked to, disable large page
+                * support for this slot
+                */
+               if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+                   !kvm_largepages_enabled()) {
+                       unsigned long j;
+
+                       for (j = 0; j < lpages; ++j)
+                               slot->arch.lpage_info[i - 1][j].write_count = 1;
+               }
+       }
+
+       return 0;
+
+out_free:
+       for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+               kvm_kvfree(slot->arch.rmap[i]);
+               slot->arch.rmap[i] = NULL;
+               if (i == 0)
+                       continue;
+
+               kvm_kvfree(slot->arch.lpage_info[i - 1]);
+               slot->arch.lpage_info[i - 1] = NULL;
+       }
+       return -ENOMEM;
 }
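
To illustrate the alignment check in kvm_arch_create_memslot() with hypothetical numbers, assume 4K base pages and the 2M large-page level, so KVM_PAGES_PER_HPAGE(level) - 1 == 511:

	base_gfn = 0x10000   /* guest physical 256M, 2M aligned              */
	ugfn     = 0x20080   /* host mapping starts 512K into a 2M region    */

	(base_gfn ^ ugfn) & 511 == 0x80   /* non-zero */

No guest 2M page in such a slot can be backed by a host 2M page, so every lpage_info entry gets write_count = 1 and the slot is mapped with 4K pages only.
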
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -6151,19 +6405,17 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                map_flags = MAP_SHARED | MAP_ANONYMOUS;
 
        /*To keep backward compatibility with older userspace,
-        *x86 needs to hanlde !user_alloc case.
+        *x86 needs to handle !user_alloc case.
         */
        if (!user_alloc) {
-               if (npages && !old.rmap) {
+               if (npages && !old.npages) {
                        unsigned long userspace_addr;
 
-                       down_write(&current->mm->mmap_sem);
-                       userspace_addr = do_mmap(NULL, 0,
+                       userspace_addr = vm_mmap(NULL, 0,
                                                 npages * PAGE_SIZE,
                                                 PROT_READ | PROT_WRITE,
                                                 map_flags,
                                                 0);
-                       up_write(&current->mm->mmap_sem);
 
                        if (IS_ERR((void *)userspace_addr))
                                return PTR_ERR((void *)userspace_addr);
@@ -6184,13 +6436,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
        int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
 
-       if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+       if (!user_alloc && !old.user_alloc && old.npages && !npages) {
                int ret;
 
-               down_write(&current->mm->mmap_sem);
-               ret = do_munmap(current->mm, old.userspace_addr,
+               ret = vm_munmap(old.userspace_addr,
                                old.npages * PAGE_SIZE);
-               up_write(&current->mm->mmap_sem);
                if (ret < 0)
                        printk(KERN_WARNING
                               "kvm_vm_ioctl_set_memory_region: "
@@ -6205,14 +6455,28 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
        kvm_mmu_slot_remove_write_access(kvm, mem->slot);
        spin_unlock(&kvm->mmu_lock);
+       /*
+        * If a memory slot is created or moved, we need to clear all
+        * mmio sptes.
+        */
+       if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
+               kvm_mmu_zap_all(kvm);
+               kvm_reload_remote_mmus(kvm);
+       }
 }
 
-void kvm_arch_flush_shadow(struct kvm *kvm)
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
        kvm_mmu_zap_all(kvm);
        kvm_reload_remote_mmus(kvm);
 }
 
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+                                  struct kvm_memory_slot *slot)
+{
+       kvm_arch_flush_shadow_all(kvm);
+}
+
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
@@ -6224,21 +6488,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
                 kvm_cpu_has_interrupt(vcpu));
 }
 
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 {
-       int me;
-       int cpu = vcpu->cpu;
-
-       if (waitqueue_active(&vcpu->wq)) {
-               wake_up_interruptible(&vcpu->wq);
-               ++vcpu->stat.halt_wakeup;
-       }
-
-       me = get_cpu();
-       if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
-               if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
-                       smp_send_reschedule(cpu);
-       put_cpu();
+       return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
 }
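
The wakeup and IPI logic deleted here is not lost; it moves to arch-generic code, which now calls back through kvm_arch_vcpu_should_kick(). Roughly, the generic replacement looks like this sketch:

void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int me;
	int cpu = vcpu->cpu;

	if (waitqueue_active(&vcpu->wq)) {
		wake_up_interruptible(&vcpu->wq);
		++vcpu->stat.halt_wakeup;
	}

	me = get_cpu();
	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
		if (kvm_arch_vcpu_should_kick(vcpu))
			smp_send_reschedule(cpu);
	put_cpu();
}

Only the IN_GUEST_MODE test stays architecture-specific, since whether a rescheduling IPI is needed to force an exit depends on how the architecture tracks guest mode.
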
 
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
@@ -6406,6 +6658,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
                kvm_inject_page_fault(vcpu, &fault);
        }
        vcpu->arch.apf.halted = false;
+       vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 }
 
 bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)