KVM: VMX: Separate saving pre-realmode state from setting segments
[linux-3.10.git] / arch / x86 / kvm / vmx.c
index 82ab1fb..4e49caf 100644 (file)
@@ -74,7 +74,7 @@ module_param_named(unrestricted_guest,
 static bool __read_mostly enable_ept_ad_bits = 1;
 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
 
-static bool __read_mostly emulate_invalid_guest_state = 0;
+static bool __read_mostly emulate_invalid_guest_state = true;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
 static bool __read_mostly vmm_exclusive = 1;
@@ -596,10 +596,9 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
 {
        struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
-       if (is_error_page(page)) {
-               kvm_release_page_clean(page);
+       if (is_error_page(page))
                return NULL;
-       }
+
        return page;
 }
 
@@ -861,6 +860,12 @@ static inline bool cpu_has_vmx_rdtscp(void)
                SECONDARY_EXEC_RDTSCP;
 }
 
+static inline bool cpu_has_vmx_invpcid(void)
+{
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_ENABLE_INVPCID;
+}
+
 static inline bool cpu_has_virtual_nmis(void)
 {
        return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -1337,7 +1342,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
        guest_efer = vmx->vcpu.arch.efer;
 
        /*
-        * NX is emulated; LMA and LME handled by hardware; SCE meaninless
+        * NX is emulated; LMA and LME handled by hardware; SCE meaningless
         * outside long mode
         */
        ignore_bits = EFER_NX | EFER_SCE;
@@ -1482,13 +1487,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
                loadsegment(ds, vmx->host_state.ds_sel);
                loadsegment(es, vmx->host_state.es_sel);
        }
-#else
-       /*
-        * The sysexit path does not restore ds/es, so we must set them to
-        * a reasonable value ourselves.
-        */
-       loadsegment(ds, __USER_DS);
-       loadsegment(es, __USER_DS);
 #endif
        reload_tss();
 #ifdef CONFIG_X86_64
@@ -1751,6 +1749,11 @@ static bool vmx_rdtscp_supported(void)
        return cpu_has_vmx_rdtscp();
 }
 
+static bool vmx_invpcid_supported(void)
+{
+       return cpu_has_vmx_invpcid() && enable_ept;
+}
+
 /*
  * Swap MSR entry in host/guest MSR entry array.
  */
@@ -1987,7 +1990,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #endif
                CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
                CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
-               CPU_BASED_RDPMC_EXITING |
+               CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
        /*
         * We can allow some features even when not supported by the
@@ -2470,7 +2473,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                        SECONDARY_EXEC_ENABLE_EPT |
                        SECONDARY_EXEC_UNRESTRICTED_GUEST |
                        SECONDARY_EXEC_PAUSE_LOOP_EXITING |
-                       SECONDARY_EXEC_RDTSCP;
+                       SECONDARY_EXEC_RDTSCP |
+                       SECONDARY_EXEC_ENABLE_INVPCID;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
@@ -2764,7 +2768,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
        return kvm->arch.tss_addr;
 }
 
-static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
+static void save_rmode_seg(int seg, struct kvm_save_segment *save)
 {
        struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
 
@@ -2772,6 +2776,12 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
        save->base = vmcs_readl(sf->base);
        save->limit = vmcs_read32(sf->limit);
        save->ar = vmcs_read32(sf->ar_bytes);
+}
+
+static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
+{
+       struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
        vmcs_write16(sf->selector, save->base >> 4);
        vmcs_write32(sf->base, save->base & 0xffff0);
        vmcs_write32(sf->limit, 0xffff);
@@ -2794,6 +2804,12 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
        vmx->emulation_required = 1;
        vmx->rmode.vm86_active = 1;
 
+       save_rmode_seg(VCPU_SREG_TR, &vmx->rmode.tr);
+       save_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
+       save_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
+       save_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+       save_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
+
        /*
         * Very old userspace does not call KVM_SET_TSS_ADDR before entering
         * vcpu. Call it here with phys address pointing 16M below 4G.
@@ -2808,14 +2824,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 
        vmx_segment_cache_clear(vmx);
 
-       vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
-       vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
        vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
-
-       vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
        vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
-
-       vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
 
        flags = vmcs_readl(GUEST_RFLAGS);
@@ -3249,7 +3259,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
         * qemu binaries.
         *   IA32 arch specifies that at the time of processor reset the
         * "Accessed" bit in the AR field of segment registers is 1. And qemu
-        * is setting it to 0 in the usedland code. This causes invalid guest
+        * is setting it to 0 in the userland code. This causes invalid guest
         * state vmexit when "unrestricted guest" mode is turned on.
         *    Fix for this setup issue in cpu_reset is being pushed in the qemu
         * tree. Newer qemu binaries with that qemu fix would not need this
@@ -3800,6 +3810,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
        if (!enable_ept) {
                exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
                enable_unrestricted_guest = 0;
+               /* Enabling INVPCID for non-EPT guests may cause a performance regression. */
+               exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
        }
        if (!enable_unrestricted_guest)
                exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -4432,7 +4444,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
        hypercall[2] = 0xc1;
 }
 
-/* called to set cr0 as approriate for a mov-to-cr0 exit. */
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
 {
        if (to_vmx(vcpu)->nested.vmxon &&
@@ -4838,6 +4850,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification;
        gpa_t gpa;
+       u32 error_code;
        int gla_validity;
 
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4862,7 +4875,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        trace_kvm_page_fault(gpa, exit_qualification);
-       return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
+
+       /* Is it a write fault? */
+       error_code = exit_qualification & (1U << 1);
+       /* Is the EPT page table present? */
+       error_code |= (exit_qualification >> 3) & 0x1;
+
+       return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
 static u64 ept_rsvd_mask(u64 spte, int level)
@@ -4977,15 +4996,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
        int ret = 1;
        u32 cpu_exec_ctrl;
        bool intr_window_requested;
+       unsigned count = 130;
 
        cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
 
-       while (!guest_state_valid(vcpu)) {
-               if (intr_window_requested
-                   && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
+       while (!guest_state_valid(vcpu) && count-- != 0) {
+               if (intr_window_requested && vmx_interrupt_allowed(vcpu))
                        return handle_interrupt_window(&vmx->vcpu);
 
+               if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
+                       return 1;
+
                err = emulate_instruction(vcpu, 0);
 
                if (err == EMULATE_DO_MMIO) {
@@ -4993,8 +5015,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                        goto out;
                }
 
-               if (err != EMULATE_DONE)
+               if (err != EMULATE_DONE) {
+                       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+                       vcpu->run->internal.ndata = 0;
                        return 0;
+               }
 
                if (signal_pending(current))
                        goto out;
@@ -5002,7 +5028,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                        schedule();
        }
 
-       vmx->emulation_required = 0;
+       vmx->emulation_required = !guest_state_valid(vcpu);
 out:
        return ret;
 }
@@ -6202,6 +6228,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long debugctlmsr;
 
        if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
                struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -6241,6 +6268,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                vmx_set_interrupt_shadow(vcpu, 0);
 
        atomic_switch_perf_msrs(vmx);
+       debugctlmsr = get_debugctlmsr();
 
        vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
@@ -6342,6 +6370,23 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
              );
 
+       /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+       if (debugctlmsr)
+               update_debugctlmsr(debugctlmsr);
+
+#ifndef CONFIG_X86_64
+       /*
+        * The sysexit path does not restore ds/es, so we must set them to
+        * a reasonable value ourselves.
+        *
+        * We can't defer this to vmx_load_host_state() since that function
+        * may be executed in interrupt context, which saves and restores segments
+        * around it, nullifying its effect.
+        */
+       loadsegment(ds, __USER_DS);
+       loadsegment(es, __USER_DS);
+#endif
+
        vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
                                  | (1 << VCPU_EXREG_RFLAGS)
                                  | (1 << VCPU_EXREG_CPL)
@@ -6536,6 +6581,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
                        }
                }
        }
+
+       exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+       /* Expose INVPCID only when PCID is exposed */
+       best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
+       if (vmx_invpcid_supported() &&
+           best && (best->ecx & bit(X86_FEATURE_INVPCID)) &&
+           guest_cpuid_has_pcid(vcpu)) {
+               exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
+               vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+                            exec_control);
+       } else {
+               exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+               vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+                            exec_control);
+               if (best)
+                       best->ecx &= ~bit(X86_FEATURE_INVPCID);
+       }
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -7270,6 +7332,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .cpuid_update = vmx_cpuid_update,
 
        .rdtscp_supported = vmx_rdtscp_supported,
+       .invpcid_supported = vmx_invpcid_supported,
 
        .set_supported_cpuid = vmx_set_supported_cpuid,