Merge branch 'kvm-updates/2.6.32' of git://git.kernel.org/pub/scm/virt/kvm/kvm
[linux-2.6.git] / arch / x86 / kvm / x86.c
index 633ccc7..be451ee 100644 (file)
 #include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 #include <linux/cpufreq.h>
+#include <trace/events/kvm.h>
+#undef TRACE_INCLUDE_FILE
+#define CREATE_TRACE_POINTS
+#include "trace.h"
 
 #include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
 #include <asm/mtrr.h>
+#include <asm/mce.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS                                              \
                          | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+#define KVM_MAX_MCE_BANKS 32
+#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
+
 /* EFER defaults:
  * - enable syscall per default because its emulated by KVM
  * - enable LME and LMA per default on 64 bit KVM
@@ -68,14 +77,16 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
+static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
                                    struct kvm_cpuid_entry2 __user *entries);
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
-                                             u32 function, u32 index);
 
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
+int ignore_msrs = 0;
+module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
        { "pf_guest", VCPU_STAT(pf_guest) },
@@ -122,18 +133,16 @@ unsigned long segment_base(u16 selector)
        if (selector == 0)
                return 0;
 
-       asm("sgdt %0" : "=m"(gdt));
+       kvm_get_gdt(&gdt);
        table_base = gdt.base;
 
        if (selector & 4) {           /* from ldt */
-               u16 ldt_selector;
+               u16 ldt_selector = kvm_read_ldt();
 
-               asm("sldt %0" : "=g"(ldt_selector));
                table_base = segment_base(ldt_selector);
        }
        d = (struct desc_struct *)(table_base + (selector & ~7));
-       v = d->base0 | ((unsigned long)d->base1 << 16) |
-               ((unsigned long)d->base2 << 24);
+       v = get_desc_base(d);
 #ifdef CONFIG_X86_64
        if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
                v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
@@ -176,16 +185,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
        ++vcpu->stat.pf_guest;
 
        if (vcpu->arch.exception.pending) {
-               if (vcpu->arch.exception.nr == PF_VECTOR) {
-                       printk(KERN_DEBUG "kvm: inject_page_fault:"
-                                       " double fault 0x%lx\n", addr);
-                       vcpu->arch.exception.nr = DF_VECTOR;
-                       vcpu->arch.exception.error_code = 0;
-               } else if (vcpu->arch.exception.nr == DF_VECTOR) {
+               switch(vcpu->arch.exception.nr) {
+               case DF_VECTOR:
                        /* triple fault -> shutdown */
                        set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+                       return;
+               case PF_VECTOR:
+                       vcpu->arch.exception.nr = DF_VECTOR;
+                       vcpu->arch.exception.error_code = 0;
+                       return;
+               default:
+                       /* replace previous exception with a new one in a hope
+                          that instruction re-execution will regenerate lost
+                          exception */
+                       vcpu->arch.exception.pending = false;
+                       break;
                }
-               return;
        }
        vcpu->arch.cr2 = addr;
        kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
@@ -207,12 +222,18 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 
-static void __queue_exception(struct kvm_vcpu *vcpu)
+/*
+ * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
+ * a #GP and return false.
+ */
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 {
-       kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
-                                    vcpu->arch.exception.has_error_code,
-                                    vcpu->arch.exception.error_code);
+       if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
+               return true;
+       kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+       return false;
 }
+EXPORT_SYMBOL_GPL(kvm_require_cpl);
 
 /*
  * Load the pae pdptrs.  Return true is they are all valid.
@@ -232,7 +253,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
                goto out;
        }
        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
-               if (is_present_pte(pdpte[i]) &&
+               if (is_present_gpte(pdpte[i]) &&
                    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
                        ret = 0;
                        goto out;
@@ -241,6 +262,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
        ret = 1;
 
        memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+       __set_bit(VCPU_EXREG_PDPTR,
+                 (unsigned long *)&vcpu->arch.regs_avail);
+       __set_bit(VCPU_EXREG_PDPTR,
+                 (unsigned long *)&vcpu->arch.regs_dirty);
 out:
 
        return ret;
@@ -256,6 +281,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
        if (is_long_mode(vcpu) || !is_pae(vcpu))
                return false;
 
+       if (!test_bit(VCPU_EXREG_PDPTR,
+                     (unsigned long *)&vcpu->arch.regs_avail))
+               return true;
+
        r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
        if (r < 0)
                goto out;
@@ -328,9 +357,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
        kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
-       KVMTRACE_1D(LMSW, vcpu,
-                   (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
-                   handler);
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
@@ -466,7 +492,7 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-       MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+       MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
        MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 };
 
@@ -644,8 +670,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
-       kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
-                         &vcpu->hv_clock.tsc_timestamp);
+       kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
        ktime_get_ts(&ts);
        local_irq_restore(flags);
 
@@ -778,23 +803,60 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        return 0;
 }
 
+static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+       u64 mcg_cap = vcpu->arch.mcg_cap;
+       unsigned bank_num = mcg_cap & 0xff;
+
+       switch (msr) {
+       case MSR_IA32_MCG_STATUS:
+               vcpu->arch.mcg_status = data;
+               break;
+       case MSR_IA32_MCG_CTL:
+               if (!(mcg_cap & MCG_CTL_P))
+                       return 1;
+               if (data != 0 && data != ~(u64)0)
+                       return -1;
+               vcpu->arch.mcg_ctl = data;
+               break;
+       default:
+               if (msr >= MSR_IA32_MC0_CTL &&
+                   msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+                       u32 offset = msr - MSR_IA32_MC0_CTL;
+                       /* only 0 or all 1s can be written to IA32_MCi_CTL */
+                       if ((offset & 0x3) == 0 &&
+                           data != 0 && data != ~(u64)0)
+                               return -1;
+                       vcpu->arch.mce_banks[offset] = data;
+                       break;
+               }
+               return 1;
+       }
+       return 0;
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
        switch (msr) {
        case MSR_EFER:
                set_efer(vcpu, data);
                break;
-       case MSR_IA32_MC0_STATUS:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
-                      __func__, data);
+       case MSR_K7_HWCR:
+               data &= ~(u64)0x40;     /* ignore flush filter disable */
+               if (data != 0) {
+                       pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+                               data);
+                       return 1;
+               }
                break;
-       case MSR_IA32_MCG_STATUS:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
-                       __func__, data);
+       case MSR_FAM10H_MMIO_CONF_BASE:
+               if (data != 0) {
+                       pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+                               "0x%llx\n", data);
+                       return 1;
+               }
                break;
-       case MSR_IA32_MCG_CTL:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
-                       __func__, data);
+       case MSR_AMD64_NB_CFG:
                break;
        case MSR_IA32_DEBUGCTLMSR:
                if (!data) {
@@ -811,12 +873,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        case MSR_IA32_UCODE_REV:
        case MSR_IA32_UCODE_WRITE:
        case MSR_VM_HSAVE_PA:
+       case MSR_AMD64_PATCH_LOADER:
                break;
        case 0x200 ... 0x2ff:
                return set_msr_mtrr(vcpu, msr, data);
        case MSR_IA32_APICBASE:
                kvm_set_apic_base(vcpu, data);
                break;
+       case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+               return kvm_x2apic_msr_write(vcpu, msr, data);
        case MSR_IA32_MISC_ENABLE:
                vcpu->arch.ia32_misc_enable_msr = data;
                break;
@@ -850,9 +915,50 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                kvm_request_guest_time_update(vcpu);
                break;
        }
+       case MSR_IA32_MCG_CTL:
+       case MSR_IA32_MCG_STATUS:
+       case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+               return set_msr_mce(vcpu, msr, data);
+
+       /* Performance counters are not protected by a CPUID bit,
+        * so we should check all of them in the generic path for the sake of
+        * cross vendor migration.
+        * Writing a zero into the event select MSRs disables them,
+        * which we perfectly emulate ;-). Any other value should be at least
+        * reported, some guests depend on them.
+        */
+       case MSR_P6_EVNTSEL0:
+       case MSR_P6_EVNTSEL1:
+       case MSR_K7_EVNTSEL0:
+       case MSR_K7_EVNTSEL1:
+       case MSR_K7_EVNTSEL2:
+       case MSR_K7_EVNTSEL3:
+               if (data != 0)
+                       pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+                               "0x%x data 0x%llx\n", msr, data);
+               break;
+       /* at least RHEL 4 unconditionally writes to the perfctr registers,
+        * so we ignore writes to make it happy.
+        */
+       case MSR_P6_PERFCTR0:
+       case MSR_P6_PERFCTR1:
+       case MSR_K7_PERFCTR0:
+       case MSR_K7_PERFCTR1:
+       case MSR_K7_PERFCTR2:
+       case MSR_K7_PERFCTR3:
+               pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+                       "0x%x data 0x%llx\n", msr, data);
+               break;
        default:
-               pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
-               return 1;
+               if (!ignore_msrs) {
+                       pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
+                               msr, data);
+                       return 1;
+               } else {
+                       pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
+                               msr, data);
+                       break;
+               }
        }
        return 0;
 }
@@ -905,26 +1011,47 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        return 0;
 }
 
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
        u64 data;
+       u64 mcg_cap = vcpu->arch.mcg_cap;
+       unsigned bank_num = mcg_cap & 0xff;
 
        switch (msr) {
-       case 0xc0010010: /* SYSCFG */
-       case 0xc0010015: /* HWCR */
-       case MSR_IA32_PLATFORM_ID:
        case MSR_IA32_P5_MC_ADDR:
        case MSR_IA32_P5_MC_TYPE:
-       case MSR_IA32_MC0_CTL:
-       case MSR_IA32_MCG_STATUS:
+               data = 0;
+               break;
        case MSR_IA32_MCG_CAP:
+               data = vcpu->arch.mcg_cap;
+               break;
        case MSR_IA32_MCG_CTL:
-       case MSR_IA32_MC0_MISC:
-       case MSR_IA32_MC0_MISC+4:
-       case MSR_IA32_MC0_MISC+8:
-       case MSR_IA32_MC0_MISC+12:
-       case MSR_IA32_MC0_MISC+16:
-       case MSR_IA32_MC0_MISC+20:
+               if (!(mcg_cap & MCG_CTL_P))
+                       return 1;
+               data = vcpu->arch.mcg_ctl;
+               break;
+       case MSR_IA32_MCG_STATUS:
+               data = vcpu->arch.mcg_status;
+               break;
+       default:
+               if (msr >= MSR_IA32_MC0_CTL &&
+                   msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+                       u32 offset = msr - MSR_IA32_MC0_CTL;
+                       data = vcpu->arch.mce_banks[offset];
+                       break;
+               }
+               return 1;
+       }
+       *pdata = data;
+       return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+       u64 data;
+
+       switch (msr) {
+       case MSR_IA32_PLATFORM_ID:
        case MSR_IA32_UCODE_REV:
        case MSR_IA32_EBL_CR_POWERON:
        case MSR_IA32_DEBUGCTLMSR:
@@ -932,10 +1059,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_IA32_LASTBRANCHTOIP:
        case MSR_IA32_LASTINTFROMIP:
        case MSR_IA32_LASTINTTOIP:
+       case MSR_K8_SYSCFG:
+       case MSR_K7_HWCR:
        case MSR_VM_HSAVE_PA:
+       case MSR_P6_PERFCTR0:
+       case MSR_P6_PERFCTR1:
        case MSR_P6_EVNTSEL0:
        case MSR_P6_EVNTSEL1:
        case MSR_K7_EVNTSEL0:
+       case MSR_K7_PERFCTR0:
+       case MSR_K8_INT_PENDING_MSG:
+       case MSR_AMD64_NB_CFG:
+       case MSR_FAM10H_MMIO_CONF_BASE:
                data = 0;
                break;
        case MSR_MTRRcap:
@@ -949,6 +1084,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_IA32_APICBASE:
                data = kvm_get_apic_base(vcpu);
                break;
+       case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+               return kvm_x2apic_msr_read(vcpu, msr, pdata);
+               break;
        case MSR_IA32_MISC_ENABLE:
                data = vcpu->arch.ia32_misc_enable_msr;
                break;
@@ -967,9 +1105,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_KVM_SYSTEM_TIME:
                data = vcpu->arch.time;
                break;
+       case MSR_IA32_P5_MC_ADDR:
+       case MSR_IA32_P5_MC_TYPE:
+       case MSR_IA32_MCG_CAP:
+       case MSR_IA32_MCG_CTL:
+       case MSR_IA32_MCG_STATUS:
+       case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+               return get_msr_mce(vcpu, msr, pdata);
        default:
-               pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
-               return 1;
+               if (!ignore_msrs) {
+                       pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+                       return 1;
+               } else {
+                       pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
+                       data = 0;
+               }
+               break;
        }
        *pdata = data;
        return 0;
@@ -1068,6 +1219,11 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_REINJECT_CONTROL:
        case KVM_CAP_IRQ_INJECT_STATUS:
        case KVM_CAP_ASSIGN_DEV_IRQ:
+       case KVM_CAP_IRQFD:
+       case KVM_CAP_IOEVENTFD:
+       case KVM_CAP_PIT2:
+       case KVM_CAP_PIT_STATE2:
+       case KVM_CAP_SET_IDENTITY_MAP_ADDR:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@ -1088,6 +1244,9 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_IOMMU:
                r = iommu_found();
                break;
+       case KVM_CAP_MCE:
+               r = KVM_MAX_MCE_BANKS;
+               break;
        default:
                r = 0;
                break;
@@ -1147,6 +1306,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
                r = 0;
                break;
        }
+       case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+               u64 mce_cap;
+
+               mce_cap = KVM_MCE_CAP_SUPPORTED;
+               r = -EFAULT;
+               if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+                       goto out;
+               r = 0;
+               break;
+       }
        default:
                r = -EINVAL;
        }
@@ -1227,6 +1396,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
        vcpu->arch.cpuid_nent = cpuid->nent;
        cpuid_fix_nx_cap(vcpu);
        r = 0;
+       kvm_apic_set_version(vcpu);
 
 out_free:
        vfree(cpuid_entries);
@@ -1248,6 +1418,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
                           cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
                goto out;
        vcpu->arch.cpuid_nent = cpuid->nent;
+       kvm_apic_set_version(vcpu);
        return 0;
 
 out:
@@ -1290,6 +1461,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                         u32 index, int *nent, int maxnent)
 {
        unsigned f_nx = is_efer_nx() ? F(NX) : 0;
+       unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
 #ifdef CONFIG_X86_64
        unsigned f_lm = F(LM);
 #else
@@ -1314,7 +1486,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
                F(PAT) | F(PSE36) | 0 /* Reserved */ |
                f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
-               F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
+               F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
                0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
        /* cpuid 1.ecx */
        const u32 kvm_supported_word4_x86_features =
@@ -1323,7 +1495,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
                0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
                0 /* Reserved, DCA */ | F(XMM4_1) |
-               F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
+               F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
                0 /* Reserved, XSAVE, OSXSAVE */;
        /* cpuid 0x80000001.ecx */
        const u32 kvm_supported_word6_x86_features =
@@ -1344,6 +1516,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
        case 1:
                entry->edx &= kvm_supported_word0_x86_features;
                entry->ecx &= kvm_supported_word4_x86_features;
+               /* we support x2apic emulation even if host does not support
+                * it since we emulate x2apic in software */
+               entry->ecx |= F(X2APIC);
                break;
        /* function 2 entries are STATEFUL. That is, repeated cpuid commands
         * may return different values. This forces us to get_cpu() before
@@ -1435,6 +1610,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
        for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
                do_cpuid_ent(&cpuid_entries[nent], func, 0,
                             &nent, cpuid->nent);
+       r = -E2BIG;
+       if (nent >= cpuid->nent)
+               goto out_free;
+
        r = -EFAULT;
        if (copy_to_user(entries, cpuid_entries,
                         nent * sizeof(struct kvm_cpuid_entry2)))
@@ -1464,6 +1643,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
        vcpu_load(vcpu);
        memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
        kvm_apic_post_state_restore(vcpu);
+       update_cr8_intercept(vcpu);
        vcpu_put(vcpu);
 
        return 0;
@@ -1503,6 +1683,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+                                       u64 mcg_cap)
+{
+       int r;
+       unsigned bank_num = mcg_cap & 0xff, bank;
+
+       r = -EINVAL;
+       if (!bank_num)
+               goto out;
+       if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+               goto out;
+       r = 0;
+       vcpu->arch.mcg_cap = mcg_cap;
+       /* Init IA32_MCG_CTL to all 1s */
+       if (mcg_cap & MCG_CTL_P)
+               vcpu->arch.mcg_ctl = ~(u64)0;
+       /* Init IA32_MCi_CTL to all 1s */
+       for (bank = 0; bank < bank_num; bank++)
+               vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+out:
+       return r;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+                                     struct kvm_x86_mce *mce)
+{
+       u64 mcg_cap = vcpu->arch.mcg_cap;
+       unsigned bank_num = mcg_cap & 0xff;
+       u64 *banks = vcpu->arch.mce_banks;
+
+       if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+               return -EINVAL;
+       /*
+        * if IA32_MCG_CTL is not all 1s, the uncorrected error
+        * reporting is disabled
+        */
+       if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+           vcpu->arch.mcg_ctl != ~(u64)0)
+               return 0;
+       banks += 4 * mce->bank;
+       /*
+        * if IA32_MCi_CTL is not all 1s, the uncorrected error
+        * reporting is disabled for the bank
+        */
+       if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+               return 0;
+       if (mce->status & MCI_STATUS_UC) {
+               if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+                   !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+                       printk(KERN_DEBUG "kvm: set_mce: "
+                              "injects mce exception while "
+                              "previous one is in progress!\n");
+                       set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+                       return 0;
+               }
+               if (banks[1] & MCI_STATUS_VAL)
+                       mce->status |= MCI_STATUS_OVER;
+               banks[2] = mce->addr;
+               banks[3] = mce->misc;
+               vcpu->arch.mcg_status = mce->mcg_status;
+               banks[1] = mce->status;
+               kvm_queue_exception(vcpu, MC_VECTOR);
+       } else if (!(banks[1] & MCI_STATUS_VAL)
+                  || !(banks[1] & MCI_STATUS_UC)) {
+               if (banks[1] & MCI_STATUS_VAL)
+                       mce->status |= MCI_STATUS_OVER;
+               banks[2] = mce->addr;
+               banks[3] = mce->misc;
+               banks[1] = mce->status;
+       } else
+               banks[1] |= MCI_STATUS_OVER;
+       return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
 {
@@ -1636,6 +1890,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
                break;
        }
+       case KVM_X86_SETUP_MCE: {
+               u64 mcg_cap;
+
+               r = -EFAULT;
+               if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+                       goto out;
+               r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+               break;
+       }
+       case KVM_X86_SET_MCE: {
+               struct kvm_x86_mce mce;
+
+               r = -EFAULT;
+               if (copy_from_user(&mce, argp, sizeof mce))
+                       goto out;
+               r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+               break;
+       }
        default:
                r = -EINVAL;
        }
@@ -1654,6 +1926,13 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
        return ret;
 }
 
+static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
+                                             u64 ident_addr)
+{
+       kvm->arch.ept_identity_map_addr = ident_addr;
+       return 0;
+}
+
 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
                                          u32 kvm_nr_mmu_pages)
 {
@@ -1775,19 +2054,25 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
        r = 0;
        switch (chip->chip_id) {
        case KVM_IRQCHIP_PIC_MASTER:
+               spin_lock(&pic_irqchip(kvm)->lock);
                memcpy(&pic_irqchip(kvm)->pics[0],
                        &chip->chip.pic,
                        sizeof(struct kvm_pic_state));
+               spin_unlock(&pic_irqchip(kvm)->lock);
                break;
        case KVM_IRQCHIP_PIC_SLAVE:
+               spin_lock(&pic_irqchip(kvm)->lock);
                memcpy(&pic_irqchip(kvm)->pics[1],
                        &chip->chip.pic,
                        sizeof(struct kvm_pic_state));
+               spin_unlock(&pic_irqchip(kvm)->lock);
                break;
        case KVM_IRQCHIP_IOAPIC:
+               mutex_lock(&kvm->irq_lock);
                memcpy(ioapic_irqchip(kvm),
                        &chip->chip.ioapic,
                        sizeof(struct kvm_ioapic_state));
+               mutex_unlock(&kvm->irq_lock);
                break;
        default:
                r = -EINVAL;
@@ -1801,7 +2086,9 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 {
        int r = 0;
 
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
        memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
        return r;
 }
 
@@ -1809,8 +2096,39 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 {
        int r = 0;
 
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
        memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
-       kvm_pit_load_count(kvm, 0, ps->channels[0].count);
+       kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       return r;
+}
+
+static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+       int r = 0;
+
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
+       memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+               sizeof(ps->channels));
+       ps->flags = kvm->arch.vpit->pit_state.flags;
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       return r;
+}
+
+static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+       int r = 0, start = 0;
+       u32 prev_legacy, cur_legacy;
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
+       prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+       cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+       if (!prev_legacy && cur_legacy)
+               start = 1;
+       memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
+              sizeof(kvm->arch.vpit->pit_state.channels));
+       kvm->arch.vpit->pit_state.flags = ps->flags;
+       kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
        return r;
 }
 
@@ -1819,7 +2137,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 {
        if (!kvm->arch.vpit)
                return -ENXIO;
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
        kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
        return 0;
 }
 
@@ -1845,7 +2165,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
                spin_lock(&kvm->mmu_lock);
                kvm_mmu_slot_remove_write_access(kvm, log->slot);
                spin_unlock(&kvm->mmu_lock);
-               kvm_flush_remote_tlbs(kvm);
                memslot = &kvm->memslots[log->slot];
                n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
                memset(memslot->dirty_bitmap, 0, n);
@@ -1869,7 +2188,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
         */
        union {
                struct kvm_pit_state ps;
+               struct kvm_pit_state2 ps2;
                struct kvm_memory_alias alias;
+               struct kvm_pit_config pit_config;
        } u;
 
        switch (ioctl) {
@@ -1878,6 +2199,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (r < 0)
                        goto out;
                break;
+       case KVM_SET_IDENTITY_MAP_ADDR: {
+               u64 ident_addr;
+
+               r = -EFAULT;
+               if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
+                       goto out;
+               r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+               if (r < 0)
+                       goto out;
+               break;
+       }
        case KVM_SET_MEMORY_REGION: {
                struct kvm_memory_region kvm_mem;
                struct kvm_userspace_memory_region kvm_userspace_mem;
@@ -1930,16 +2262,24 @@ long kvm_arch_vm_ioctl(struct file *filp,
                }
                break;
        case KVM_CREATE_PIT:
-               mutex_lock(&kvm->lock);
+               u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+               goto create_pit;
+       case KVM_CREATE_PIT2:
+               r = -EFAULT;
+               if (copy_from_user(&u.pit_config, argp,
+                                  sizeof(struct kvm_pit_config)))
+                       goto out;
+       create_pit:
+               down_write(&kvm->slots_lock);
                r = -EEXIST;
                if (kvm->arch.vpit)
                        goto create_pit_unlock;
                r = -ENOMEM;
-               kvm->arch.vpit = kvm_create_pit(kvm);
+               kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
                if (kvm->arch.vpit)
                        r = 0;
        create_pit_unlock:
-               mutex_unlock(&kvm->lock);
+               up_write(&kvm->slots_lock);
                break;
        case KVM_IRQ_LINE_STATUS:
        case KVM_IRQ_LINE: {
@@ -1950,10 +2290,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
                        goto out;
                if (irqchip_in_kernel(kvm)) {
                        __s32 status;
-                       mutex_lock(&kvm->lock);
+                       mutex_lock(&kvm->irq_lock);
                        status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
                                        irq_event.irq, irq_event.level);
-                       mutex_unlock(&kvm->lock);
+                       mutex_unlock(&kvm->irq_lock);
                        if (ioctl == KVM_IRQ_LINE_STATUS) {
                                irq_event.status = status;
                                if (copy_to_user(argp, &irq_event,
@@ -2042,6 +2382,32 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = 0;
                break;
        }
+       case KVM_GET_PIT2: {
+               r = -ENXIO;
+               if (!kvm->arch.vpit)
+                       goto out;
+               r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_PIT2: {
+               r = -EFAULT;
+               if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
+                       goto out;
+               r = -ENXIO;
+               if (!kvm->arch.vpit)
+                       goto out;
+               r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
        case KVM_REINJECT_CONTROL: {
                struct kvm_reinject_control control;
                r =  -EFAULT;
@@ -2075,35 +2441,23 @@ static void kvm_init_msr_list(void)
        num_msrs_to_save = j;
 }
 
-/*
- * Only apic need an MMIO device hook, so shortcut now..
- */
-static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
-                                               gpa_t addr, int len,
-                                               int is_write)
+static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
+                          const void *v)
 {
-       struct kvm_io_device *dev;
+       if (vcpu->arch.apic &&
+           !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
+               return 0;
 
-       if (vcpu->arch.apic) {
-               dev = &vcpu->arch.apic->dev;
-               if (dev->in_range(dev, addr, len, is_write))
-                       return dev;
-       }
-       return NULL;
+       return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
 }
 
-
-static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
-                                               gpa_t addr, int len,
-                                               int is_write)
+static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 {
-       struct kvm_io_device *dev;
+       if (vcpu->arch.apic &&
+           !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
+               return 0;
 
-       dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
-       if (dev == NULL)
-               dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
-                                         is_write);
-       return dev;
+       return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
 }
 
 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
@@ -2172,11 +2526,12 @@ static int emulator_read_emulated(unsigned long addr,
                                  unsigned int bytes,
                                  struct kvm_vcpu *vcpu)
 {
-       struct kvm_io_device *mmio_dev;
        gpa_t                 gpa;
 
        if (vcpu->mmio_read_completed) {
                memcpy(val, vcpu->mmio_data, bytes);
+               trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
+                              vcpu->mmio_phys_addr, *(u64 *)val);
                vcpu->mmio_read_completed = 0;
                return X86EMUL_CONTINUE;
        }
@@ -2197,14 +2552,12 @@ mmio:
        /*
         * Is this MMIO handled locally?
         */
-       mutex_lock(&vcpu->kvm->lock);
-       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
-       if (mmio_dev) {
-               kvm_iodevice_read(mmio_dev, gpa, bytes, val);
-               mutex_unlock(&vcpu->kvm->lock);
+       if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
+               trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
                return X86EMUL_CONTINUE;
        }
-       mutex_unlock(&vcpu->kvm->lock);
+
+       trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
 
        vcpu->mmio_needed = 1;
        vcpu->mmio_phys_addr = gpa;
@@ -2231,7 +2584,6 @@ static int emulator_write_emulated_onepage(unsigned long addr,
                                           unsigned int bytes,
                                           struct kvm_vcpu *vcpu)
 {
-       struct kvm_io_device *mmio_dev;
        gpa_t                 gpa;
 
        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
@@ -2249,17 +2601,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
                return X86EMUL_CONTINUE;
 
 mmio:
+       trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
        /*
         * Is this MMIO handled locally?
         */
-       mutex_lock(&vcpu->kvm->lock);
-       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
-       if (mmio_dev) {
-               kvm_iodevice_write(mmio_dev, gpa, bytes, val);
-               mutex_unlock(&vcpu->kvm->lock);
+       if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
                return X86EMUL_CONTINUE;
-       }
-       mutex_unlock(&vcpu->kvm->lock);
 
        vcpu->mmio_needed = 1;
        vcpu->mmio_phys_addr = gpa;
@@ -2343,7 +2690,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 
 int emulate_clts(struct kvm_vcpu *vcpu)
 {
-       KVMTRACE_0D(CLTS, vcpu, handler);
        kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
        return X86EMUL_CONTINUE;
 }
@@ -2420,7 +2766,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
        kvm_clear_exception_queue(vcpu);
        vcpu->arch.mmio_fault_cr2 = cr2;
        /*
-        * TODO: fix x86_emulate.c to use guest_read/write_register
+        * TODO: fix emulate.c to use guest_read/write_register
         * instead of direct ->regs accesses, can save hundred cycles
         * on Intel for instructions that don't read/change RSP, for
         * for example.
@@ -2444,14 +2790,33 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 
                r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 
-               /* Reject the instructions other than VMCALL/VMMCALL when
-                * try to emulate invalid opcode */
+               /* Only allow emulation of specific instructions on #UD
+                * (namely VMMCALL, sysenter, sysexit, syscall)*/
                c = &vcpu->arch.emulate_ctxt.decode;
-               if ((emulation_type & EMULTYPE_TRAP_UD) &&
-                   (!(c->twobyte && c->b == 0x01 &&
-                     (c->modrm_reg == 0 || c->modrm_reg == 3) &&
-                      c->modrm_mod == 3 && c->modrm_rm == 1)))
-                       return EMULATE_FAIL;
+               if (emulation_type & EMULTYPE_TRAP_UD) {
+                       if (!c->twobyte)
+                               return EMULATE_FAIL;
+                       switch (c->b) {
+                       case 0x01: /* VMMCALL */
+                               if (c->modrm_mod != 3 || c->modrm_rm != 1)
+                                       return EMULATE_FAIL;
+                               break;
+                       case 0x34: /* sysenter */
+                       case 0x35: /* sysexit */
+                               if (c->modrm_mod != 0 || c->modrm_rm != 0)
+                                       return EMULATE_FAIL;
+                               break;
+                       case 0x05: /* syscall */
+                               if (c->modrm_mod != 0 || c->modrm_rm != 0)
+                                       return EMULATE_FAIL;
+                               break;
+                       default:
+                               return EMULATE_FAIL;
+                       }
+
+                       if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
+                               return EMULATE_FAIL;
+               }
 
                ++vcpu->stat.insn_emulation;
                if (r)  {
@@ -2571,52 +2936,40 @@ int complete_pio(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void kernel_pio(struct kvm_io_device *pio_dev,
-                      struct kvm_vcpu *vcpu,
-                      void *pd)
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 {
        /* TODO: String I/O for in kernel device */
+       int r;
 
-       mutex_lock(&vcpu->kvm->lock);
        if (vcpu->arch.pio.in)
-               kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
-                                 vcpu->arch.pio.size,
-                                 pd);
+               r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+                                   vcpu->arch.pio.size, pd);
        else
-               kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
-                                  vcpu->arch.pio.size,
-                                  pd);
-       mutex_unlock(&vcpu->kvm->lock);
+               r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+                                    vcpu->arch.pio.size, pd);
+       return r;
 }
 
-static void pio_string_write(struct kvm_io_device *pio_dev,
-                            struct kvm_vcpu *vcpu)
+static int pio_string_write(struct kvm_vcpu *vcpu)
 {
        struct kvm_pio_request *io = &vcpu->arch.pio;
        void *pd = vcpu->arch.pio_data;
-       int i;
+       int i, r = 0;
 
-       mutex_lock(&vcpu->kvm->lock);
        for (i = 0; i < io->cur_count; i++) {
-               kvm_iodevice_write(pio_dev, io->port,
-                                  io->size,
-                                  pd);
+               if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
+                                    io->port, io->size, pd)) {
+                       r = -EOPNOTSUPP;
+                       break;
+               }
                pd += io->size;
        }
-       mutex_unlock(&vcpu->kvm->lock);
-}
-
-static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
-                                              gpa_t addr, int len,
-                                              int is_write)
-{
-       return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
+       return r;
 }
 
 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                  int size, unsigned port)
 {
-       struct kvm_io_device *pio_dev;
        unsigned long val;
 
        vcpu->run->exit_reason = KVM_EXIT_IO;
@@ -2630,19 +2983,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
        vcpu->arch.pio.down = 0;
        vcpu->arch.pio.rep = 0;
 
-       if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
-               KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
-       else
-               KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
+       trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+                     size, 1);
 
        val = kvm_register_read(vcpu, VCPU_REGS_RAX);
        memcpy(vcpu->arch.pio_data, &val, 4);
 
-       pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
-       if (pio_dev) {
-               kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
+       if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
                complete_pio(vcpu);
                return 1;
        }
@@ -2656,7 +3003,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 {
        unsigned now, in_page;
        int ret = 0;
-       struct kvm_io_device *pio_dev;
 
        vcpu->run->exit_reason = KVM_EXIT_IO;
        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2669,12 +3015,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
        vcpu->arch.pio.down = down;
        vcpu->arch.pio.rep = rep;
 
-       if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
-               KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
-       else
-               KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
+       trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+                     size, count);
 
        if (!count) {
                kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -2704,9 +3046,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 
        vcpu->arch.pio.guest_gva = address;
 
-       pio_dev = vcpu_find_pio_dev(vcpu, port,
-                                   vcpu->arch.pio.cur_count,
-                                   !vcpu->arch.pio.in);
        if (!vcpu->arch.pio.in) {
                /* string PIO write */
                ret = pio_copy_data(vcpu);
@@ -2714,16 +3053,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                        kvm_inject_gp(vcpu, 0);
                        return 1;
                }
-               if (ret == 0 && pio_dev) {
-                       pio_string_write(pio_dev, vcpu);
+               if (ret == 0 && !pio_string_write(vcpu)) {
                        complete_pio(vcpu);
                        if (vcpu->arch.pio.count == 0)
                                ret = 1;
                }
-       } else if (pio_dev)
-               pr_unimpl(vcpu, "no string pio read support yet, "
-                      "port %x size %d count %ld\n",
-                       port, size, count);
+       }
+       /* no string PIO read support yet */
 
        return ret;
 }
@@ -2756,10 +3092,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 
        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
-               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-                       vcpu = kvm->vcpus[i];
-                       if (!vcpu)
-                               continue;
+               kvm_for_each_vcpu(i, vcpu, kvm) {
                        if (vcpu->cpu != freq->cpu)
                                continue;
                        if (!kvm_request_guest_time_update(vcpu))
@@ -2852,7 +3185,6 @@ void kvm_arch_exit(void)
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.halt_exits;
-       KVMTRACE_0D(HLT, vcpu, handler);
        if (irqchip_in_kernel(vcpu->kvm)) {
                vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
                return 1;
@@ -2883,7 +3215,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
        a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
        a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
 
-       KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
+       trace_kvm_hypercall(nr, a0, a1, a2, a3);
 
        if (!is_long_mode(vcpu)) {
                nr &= 0xFFFFFFFF;
@@ -2893,6 +3225,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                a3 &= 0xFFFFFFFF;
        }
 
+       if (kvm_x86_ops->get_cpl(vcpu) != 0) {
+               ret = -KVM_EPERM;
+               goto out;
+       }
+
        switch (nr) {
        case KVM_HC_VAPIC_POLL_IRQ:
                ret = 0;
@@ -2904,6 +3241,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                ret = -KVM_ENOSYS;
                break;
        }
+out:
        kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
        ++vcpu->stat.hypercalls;
        return r;
@@ -2983,8 +3321,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
                return 0;
        }
-       KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
-                   (u32)((u64)value >> 32), handler);
 
        return value;
 }
@@ -2992,9 +3328,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
                     unsigned long *rflags)
 {
-       KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
-                   (u32)((u64)val >> 32), handler);
-
        switch (cr) {
        case 0:
                kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -3104,11 +3437,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
                kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
        }
        kvm_x86_ops->skip_emulated_instruction(vcpu);
-       KVMTRACE_5D(CPUID, vcpu, function,
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
+       trace_kvm_cpuid(function,
+                       kvm_register_read(vcpu, VCPU_REGS_RAX),
+                       kvm_register_read(vcpu, VCPU_REGS_RBX),
+                       kvm_register_read(vcpu, VCPU_REGS_RCX),
+                       kvm_register_read(vcpu, VCPU_REGS_RDX));
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
 
@@ -3174,6 +3507,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        if (!kvm_x86_ops->update_cr8_intercept)
                return;
 
+       if (!vcpu->arch.apic)
+               return;
+
        if (!vcpu->arch.apic->vapic_addr)
                max_irr = kvm_lapic_find_highest_irr(vcpu);
        else
@@ -3187,12 +3523,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
-static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-               kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
-
        /* try to reinject previous events if any */
+       if (vcpu->arch.exception.pending) {
+               kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
+                                         vcpu->arch.exception.has_error_code,
+                                         vcpu->arch.exception.error_code);
+               return;
+       }
+
        if (vcpu->arch.nmi_injected) {
                kvm_x86_ops->set_nmi(vcpu);
                return;
@@ -3266,16 +3606,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        smp_mb__after_clear_bit();
 
        if (vcpu->requests || need_resched() || signal_pending(current)) {
+               set_bit(KVM_REQ_KICK, &vcpu->requests);
                local_irq_enable();
                preempt_enable();
                r = 1;
                goto out;
        }
 
-       if (vcpu->arch.exception.pending)
-               __queue_exception(vcpu);
-       else
-               inject_pending_irq(vcpu, kvm_run);
+       inject_pending_event(vcpu, kvm_run);
 
        /* enable NMI/IRQ window open exits if needed */
        if (vcpu->arch.nmi_pending)
@@ -3292,14 +3630,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        kvm_guest_enter();
 
-       get_debugreg(vcpu->arch.host_dr6, 6);
-       get_debugreg(vcpu->arch.host_dr7, 7);
        if (unlikely(vcpu->arch.switch_db_regs)) {
-               get_debugreg(vcpu->arch.host_db[0], 0);
-               get_debugreg(vcpu->arch.host_db[1], 1);
-               get_debugreg(vcpu->arch.host_db[2], 2);
-               get_debugreg(vcpu->arch.host_db[3], 3);
-
                set_debugreg(0, 7);
                set_debugreg(vcpu->arch.eff_db[0], 0);
                set_debugreg(vcpu->arch.eff_db[1], 1);
@@ -3307,18 +3638,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                set_debugreg(vcpu->arch.eff_db[3], 3);
        }
 
-       KVMTRACE_0D(VMENTRY, vcpu, entryexit);
+       trace_kvm_entry(vcpu->vcpu_id);
        kvm_x86_ops->run(vcpu, kvm_run);
 
-       if (unlikely(vcpu->arch.switch_db_regs)) {
-               set_debugreg(0, 7);
-               set_debugreg(vcpu->arch.host_db[0], 0);
-               set_debugreg(vcpu->arch.host_db[1], 1);
-               set_debugreg(vcpu->arch.host_db[2], 2);
-               set_debugreg(vcpu->arch.host_db[3], 3);
+       if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
+               set_debugreg(current->thread.debugreg0, 0);
+               set_debugreg(current->thread.debugreg1, 1);
+               set_debugreg(current->thread.debugreg2, 2);
+               set_debugreg(current->thread.debugreg3, 3);
+               set_debugreg(current->thread.debugreg6, 6);
+               set_debugreg(current->thread.debugreg7, 7);
        }
-       set_debugreg(vcpu->arch.host_dr6, 6);
-       set_debugreg(vcpu->arch.host_dr7, 7);
 
        set_bit(KVM_REQ_KICK, &vcpu->requests);
        local_irq_enable();
@@ -3648,11 +3978,8 @@ static void kvm_set_segment(struct kvm_vcpu *vcpu,
 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
                                   struct kvm_segment *kvm_desct)
 {
-       kvm_desct->base = seg_desc->base0;
-       kvm_desct->base |= seg_desc->base1 << 16;
-       kvm_desct->base |= seg_desc->base2 << 24;
-       kvm_desct->limit = seg_desc->limit0;
-       kvm_desct->limit |= seg_desc->limit << 16;
+       kvm_desct->base = get_desc_base(seg_desc);
+       kvm_desct->limit = get_desc_limit(seg_desc);
        if (seg_desc->g) {
                kvm_desct->limit <<= 12;
                kvm_desct->limit |= 0xfff;
@@ -3696,7 +4023,6 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
                                         struct desc_struct *seg_desc)
 {
-       gpa_t gpa;
        struct descriptor_table dtable;
        u16 index = selector >> 3;
 
@@ -3706,16 +4032,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
                kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
                return 1;
        }
-       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
-       gpa += index * 8;
-       return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
+       return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
 }
 
 /* allowed just for 8 bytes segments */
 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
                                         struct desc_struct *seg_desc)
 {
-       gpa_t gpa;
        struct descriptor_table dtable;
        u16 index = selector >> 3;
 
@@ -3723,19 +4046,13 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 
        if (dtable.limit < index * 8 + 7)
                return 1;
-       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
-       gpa += index * 8;
-       return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
+       return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
 }
 
 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
                             struct desc_struct *seg_desc)
 {
-       u32 base_addr;
-
-       base_addr = seg_desc->base0;
-       base_addr |= (seg_desc->base1 << 16);
-       base_addr |= (seg_desc->base2 << 24);
+       u32 base_addr = get_desc_base(seg_desc);
 
        return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
 }
@@ -3780,12 +4097,19 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
        return 0;
 }
 
+static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
+{
+       return (seg != VCPU_SREG_LDTR) &&
+               (seg != VCPU_SREG_TR) &&
+               (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM);
+}
+
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
                                int type_bits, int seg)
 {
        struct kvm_segment kvm_seg;
 
-       if (!(vcpu->arch.cr0 & X86_CR0_PE))
+       if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
                return kvm_load_realmode_segment(vcpu, selector, seg);
        if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
                return 1;
@@ -4024,7 +4348,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
                }
        }
 
-       if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
+       if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
                kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
                return 1;
        }
@@ -4094,13 +4418,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
        vcpu->arch.cr2 = sregs->cr2;
        mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
-
-       down_read(&vcpu->kvm->slots_lock);
-       if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
-               vcpu->arch.cr3 = sregs->cr3;
-       else
-               set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
-       up_read(&vcpu->kvm->slots_lock);
+       vcpu->arch.cr3 = sregs->cr3;
 
        kvm_set_cr8(vcpu, sregs->cr8);
 
@@ -4142,8 +4460,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
        kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 
+       update_cr8_intercept(vcpu);
+
        /* Older userspace won't unhalt the vcpu on reset. */
-       if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+       if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
            sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
            !(vcpu->arch.cr0 & X86_CR0_PE))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -4414,7 +4734,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        kvm = vcpu->kvm;
 
        vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-       if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
+       if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        else
                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -4436,6 +4756,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
                        goto fail_mmu_destroy;
        }
 
+       vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+                                      GFP_KERNEL);
+       if (!vcpu->arch.mce_banks) {
+               r = -ENOMEM;
+               goto fail_mmu_destroy;
+       }
+       vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
        return 0;
 
 fail_mmu_destroy:
@@ -4483,20 +4811,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 static void kvm_free_vcpus(struct kvm *kvm)
 {
        unsigned int i;
+       struct kvm_vcpu *vcpu;
 
        /*
         * Unpin any mmu pages first.
         */
-       for (i = 0; i < KVM_MAX_VCPUS; ++i)
-               if (kvm->vcpus[i])
-                       kvm_unload_vcpu_mmu(kvm->vcpus[i]);
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               if (kvm->vcpus[i]) {
-                       kvm_arch_vcpu_free(kvm->vcpus[i]);
-                       kvm->vcpus[i] = NULL;
-               }
-       }
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_unload_vcpu_mmu(vcpu);
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_arch_vcpu_free(vcpu);
+
+       mutex_lock(&kvm->lock);
+       for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+               kvm->vcpus[i] = NULL;
 
+       atomic_set(&kvm->online_vcpus, 0);
+       mutex_unlock(&kvm->lock);
 }
 
 void kvm_arch_sync_events(struct kvm *kvm)
@@ -4573,7 +4903,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 
        kvm_mmu_slot_remove_write_access(kvm, mem->slot);
        spin_unlock(&kvm->mmu_lock);
-       kvm_flush_remote_tlbs(kvm);
 
        return 0;
 }
@@ -4587,8 +4916,10 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
-              || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
-              || vcpu->arch.nmi_pending;
+               || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+               || vcpu->arch.nmi_pending ||
+               (kvm_arch_interrupt_allowed(vcpu) &&
+                kvm_cpu_has_interrupt(vcpu));
 }
 
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
@@ -4612,3 +4943,9 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
        return kvm_x86_ops->interrupt_allowed(vcpu);
 }
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);