#include "irq.h"
#include "mmu.h"
+#include "cpuid.h"
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
+#include <linux/mod_devicetable.h>
#include <linux/ftrace_event.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <asm/mce.h>
#include <asm/i387.h>
#include <asm/xcr.h>
+#include <asm/perf_event.h>
#include "trace.h"
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
-static int __read_mostly enable_vpid = 1;
+static const struct x86_cpu_id vmx_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_VMX),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+
+static bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
-static int __read_mostly flexpriority_enabled = 1;
+static bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
-static int __read_mostly enable_ept = 1;
+static bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);
-static int __read_mostly enable_unrestricted_guest = 1;
+static bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
enable_unrestricted_guest, bool, S_IRUGO);
-static int __read_mostly emulate_invalid_guest_state = 0;
+static bool __read_mostly enable_ept_ad_bits = 1;
+module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+
+static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
-static int __read_mostly vmm_exclusive = 1;
+static bool __read_mostly vmm_exclusive = 1;
module_param(vmm_exclusive, bool, S_IRUGO);
-static int __read_mostly yield_on_hlt = 1;
-module_param(yield_on_hlt, bool, S_IRUGO);
-
-static int __read_mostly fasteoi = 1;
+static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);
/*
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
* use VMX instructions.
*/
-static int __read_mostly nested = 0;
+static bool __read_mostly nested = 0;
module_param(nested, bool, S_IRUGO);
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
struct {
int loaded;
u16 fs_sel, gs_sel, ldt_sel;
+#ifdef CONFIG_X86_64
+ u16 ds_sel, es_sel;
+#endif
int gs_ldt_reload_needed;
int fs_reload_needed;
} host_state;
static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
{
struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
+ if (is_error_page(page))
return NULL;
- }
+
return page;
}
static void kvm_cpu_vmxoff(void);
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+static void vmx_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
}
+static inline bool cpu_has_vmx_ept_ad_bits(void)
+{
+ return vmx_capability.ept & VMX_EPT_AD_BIT;
+}
+
static inline bool cpu_has_vmx_invept_individual_addr(void)
{
return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
SECONDARY_EXEC_RDTSCP;
}
+static inline bool cpu_has_vmx_invpcid(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_INVPCID;
+}
+
static inline bool cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
if (m->guest[i].index == msr)
break;
- if (i == m->nr) {
+ if (i == NR_AUTOLOAD_MSRS) {
+ printk_once(KERN_WARNING"Not enough mst switch entries. "
+ "Can't add msr %x\n", msr);
+ return;
+ } else if (i == m->nr) {
++m->nr;
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
guest_efer = vmx->vcpu.arch.efer;
/*
- * NX is emulated; LMA and LME handled by hardware; SCE meaninless
+ * NX is emulated; LMA and LME handled by hardware; SCE meaningless
* outside long mode
*/
ignore_bits = EFER_NX | EFER_SCE;
}
#ifdef CONFIG_X86_64
+ savesegment(ds, vmx->host_state.ds_sel);
+ savesegment(es, vmx->host_state.es_sel);
+#endif
+
+#ifdef CONFIG_X86_64
vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
#else
}
if (vmx->host_state.fs_reload_needed)
loadsegment(fs, vmx->host_state.fs_sel);
+#ifdef CONFIG_X86_64
+ if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
+ loadsegment(ds, vmx->host_state.ds_sel);
+ loadsegment(es, vmx->host_state.es_sel);
+ }
+#endif
reload_tss();
#ifdef CONFIG_X86_64
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
- if (current_thread_info()->status & TS_USEDFPU)
+ if (user_has_fpu())
clts();
load_gdt(&__get_cpu_var(host_gdt));
}
vmx_set_interrupt_shadow(vcpu, 0);
}
-static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
-{
- /* Ensure that we clear the HLT state in the VMCS. We don't need to
- * explicitly skip the instruction because if the HLT state is set, then
- * the instruction is already executing and RIP has already been
- * advanced. */
- if (!yield_on_hlt &&
- vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
- vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
-}
-
/*
* KVM wants to inject page-faults which it got to the guest. This function
* checks whether in a nested guest, we need to inject them to L1 or L2.
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
/* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
- if (!(vmcs12->exception_bitmap & PF_VECTOR))
+ if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
return 0;
nested_vmx_vmexit(vcpu);
intr_info |= INTR_TYPE_HARD_EXCEPTION;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
- vmx_clear_hlt(vcpu);
}
static bool vmx_rdtscp_supported(void)
return cpu_has_vmx_rdtscp();
}
+static bool vmx_invpcid_supported(void)
+{
+ return cpu_has_vmx_invpcid() && enable_ept;
+}
+
/*
* Swap MSR entry in host/guest MSR entry array.
*/
int save_nmsrs, index;
unsigned long *msr_bitmap;
- vmx_load_host_state(vmx);
save_nmsrs = 0;
#ifdef CONFIG_X86_64
if (is_long_mode(&vmx->vcpu)) {
}
/*
- * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
- * ioctl. In this case the call-back should update internal vmx state to make
- * the changes effective.
+ * Engage any workarounds for mis-matched TSC rates. Currently limited to
+ * software catchup for faster rates on slower CPUs.
*/
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
{
- /* Nothing to do here */
+ if (!scale)
+ return;
+
+ if (user_tsc_khz > tsc_khz) {
+ vcpu->arch.tsc_catchup = 1;
+ vcpu->arch.tsc_always_catchup = 1;
+ } else
+ WARN(1, "user requested TSC rate below hardware speed\n");
}
/*
}
}
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
+static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
{
u64 offset = vmcs_read64(TSC_OFFSET);
vmcs_write64(TSC_OFFSET, offset + adjustment);
#endif
CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
+ CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
/*
* We can allow some features even when not supported by the
return 1;
/* Otherwise falls through */
default:
- vmx_load_host_state(to_vmx(vcpu));
if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
return 0;
msr = find_msr_entry(to_vmx(vcpu), msr_index);
if (msr) {
- vmx_load_host_state(to_vmx(vcpu));
data = msr->data;
break;
}
switch (msr_index) {
case MSR_EFER:
- vmx_load_host_state(vmx);
ret = kvm_set_msr_common(vcpu, msr_index, data);
break;
#ifdef CONFIG_X86_64
break;
msr = find_msr_entry(vmx, msr_index);
if (msr) {
- vmx_load_host_state(vmx);
msr->data = data;
+ if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
+ preempt_disable();
+ kvm_set_shared_msr(msr->index, msr->data,
+ msr->mask);
+ preempt_enable();
+ }
break;
}
ret = kvm_set_msr_common(vcpu, msr_index, data);
&_pin_based_exec_control) < 0)
return -EIO;
- min =
+ min = CPU_BASED_HLT_EXITING |
#ifdef CONFIG_X86_64
CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING |
CPU_BASED_USE_TSC_OFFSETING |
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
- CPU_BASED_INVLPG_EXITING;
-
- if (yield_on_hlt)
- min |= CPU_BASED_HLT_EXITING;
+ CPU_BASED_INVLPG_EXITING |
+ CPU_BASED_RDPMC_EXITING;
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
- SECONDARY_EXEC_RDTSCP;
+ SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_INVPCID;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
!cpu_has_vmx_ept_4levels()) {
enable_ept = 0;
enable_unrestricted_guest = 0;
+ enable_ept_ad_bits = 0;
}
+ if (!cpu_has_vmx_ept_ad_bits())
+ enable_ept_ad_bits = 0;
+
if (!cpu_has_vmx_unrestricted_guest())
enable_unrestricted_guest = 0;
{
if (!kvm->arch.tss_addr) {
struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
gfn_t base_gfn;
slots = kvm_memslots(kvm);
- base_gfn = slots->memslots[0].base_gfn +
- kvm->memslots->memslots[0].npages - 3;
+ slot = id_to_memslot(slots, 0);
+ base_gfn = slot->base_gfn + slot->npages - 3;
+
return base_gfn << PAGE_SHIFT;
}
return kvm->arch.tss_addr;
}
-static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
+static void save_rmode_seg(int seg, struct kvm_save_segment *save)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
save->base = vmcs_readl(sf->base);
save->limit = vmcs_read32(sf->limit);
save->ar = vmcs_read32(sf->ar_bytes);
+}
+
+static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
+{
+ struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
vmcs_write16(sf->selector, save->base >> 4);
vmcs_write32(sf->base, save->base & 0xffff0);
vmcs_write32(sf->limit, 0xffff);
{
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_segment var;
if (enable_unrestricted_guest)
return;
vmx->emulation_required = 1;
vmx->rmode.vm86_active = 1;
+ save_rmode_seg(VCPU_SREG_TR, &vmx->rmode.tr);
+ save_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
+ save_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
+ save_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+ save_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
+
/*
* Very old userspace does not call KVM_SET_TSS_ADDR before entering
* vcpu. Call it here with phys address pointing 16M below 4G.
vmx_segment_cache_clear(vmx);
- vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
- vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
-
- vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
-
- vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
if (emulate_invalid_guest_state)
goto continue_rmode;
- vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
- vmcs_write32(GUEST_SS_LIMIT, 0xffff);
- vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
+ vmx_get_segment(vcpu, &var, VCPU_SREG_SS);
+ vmx_set_segment(vcpu, &var, VCPU_SREG_SS);
+
+ vmx_get_segment(vcpu, &var, VCPU_SREG_CS);
+ vmx_set_segment(vcpu, &var, VCPU_SREG_CS);
- vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
- vmcs_write32(GUEST_CS_LIMIT, 0xffff);
- if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
- vmcs_writel(GUEST_CS_BASE, 0xf0000);
- vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
+ vmx_get_segment(vcpu, &var, VCPU_SREG_ES);
+ vmx_set_segment(vcpu, &var, VCPU_SREG_ES);
- fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
- fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
- fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
- fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+ vmx_get_segment(vcpu, &var, VCPU_SREG_DS);
+ vmx_set_segment(vcpu, &var, VCPU_SREG_DS);
+
+ vmx_get_segment(vcpu, &var, VCPU_SREG_GS);
+ vmx_set_segment(vcpu, &var, VCPU_SREG_GS);
+
+ vmx_get_segment(vcpu, &var, VCPU_SREG_FS);
+ vmx_set_segment(vcpu, &var, VCPU_SREG_FS);
continue_rmode:
kvm_mmu_reset_context(vcpu);
/* TODO write the value reading from MSR */
eptp = VMX_EPT_DEFAULT_MT |
VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
+ if (enable_ept_ad_bits)
+ eptp |= VMX_EPT_AD_ENABLE_BIT;
eptp |= (root_hpa & PAGE_MASK);
return eptp;
static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations
+ * fail; use the cache instead.
+ */
+ if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) {
+ return vmx->cpl;
+ }
+
if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
- to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
+ vmx->cpl = __vmx_get_cpl(vcpu);
}
- return to_vmx(vcpu)->cpl;
+
+ return vmx->cpl;
}
{
u32 ar;
- if (var->unusable)
+ if (var->unusable || !var->present)
ar = 1 << 16;
else {
ar = var->type & 15;
ar |= (var->db & 1) << 14;
ar |= (var->g & 1) << 15;
}
- if (ar == 0) /* a 0 value means unusable */
- ar = AR_UNUSABLE_MASK;
return ar;
}
* qemu binaries.
* IA32 arch specifies that at the time of processor reset the
* "Accessed" bit in the AR field of segment registers is 1. And qemu
- * is setting it to 0 in the usedland code. This causes invalid guest
+ * is setting it to 0 in the userland code. This causes invalid guest
* state vmexit when "unrestricted guest" mode is turned on.
* Fix for this setup issue in cpu_reset is being pushed in the qemu
* tree. Newer qemu binaries with that qemu fix would not need this
vmcs_write32(sf->ar_bytes, ar);
__clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+
+ /*
+ * Fix segments for real mode guest in hosts that don't have
+ * "unrestricted_mode" or it was disabled.
+ * This is done to allow migration of the guests from hosts with
+ * unrestricted guest like Westmere to older host that don't have
+ * unrestricted guest like Nehelem.
+ */
+ if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
+ switch (seg) {
+ case VCPU_SREG_CS:
+ vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
+ vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+ if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
+ vmcs_writel(GUEST_CS_BASE, 0xf0000);
+ vmcs_write16(GUEST_CS_SELECTOR,
+ vmcs_readl(GUEST_CS_BASE) >> 4);
+ break;
+ case VCPU_SREG_ES:
+ fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
+ break;
+ case VCPU_SREG_DS:
+ fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
+ break;
+ case VCPU_SREG_GS:
+ fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
+ break;
+ case VCPU_SREG_FS:
+ fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+ break;
+ case VCPU_SREG_SS:
+ vmcs_write16(GUEST_SS_SELECTOR,
+ vmcs_readl(GUEST_SS_BASE) >> 4);
+ vmcs_write32(GUEST_SS_LIMIT, 0xffff);
+ vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
+ break;
+ }
+ }
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
+#ifdef CONFIG_X86_64
+ /*
+ * Load null selectors, so we can avoid reloading them in
+ * __vmx_load_host_state(), in case userspace uses the null selectors
+ * too (the expected case).
+ */
+ vmcs_write16(HOST_DS_SELECTOR, 0);
+ vmcs_write16(HOST_ES_SELECTOR, 0);
+#else
vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+#endif
vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
enable_unrestricted_guest = 0;
+ /* Enable INVPCID for non-ept guests may cause performance regression. */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
}
if (!enable_unrestricted_guest)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
vmx_set_cr4(&vmx->vcpu, 0);
vmx_set_efer(&vmx->vcpu, 0);
vmx_fpu_activate(&vmx->vcpu);
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
u32 cpu_based_vm_exec_control;
- if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
- /* We can get here when nested_run_pending caused
- * vmx_interrupt_allowed() to return false. In this case, do
- * nothing - the interrupt will be injected later.
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+ /*
+ * We get here if vmx_interrupt_allowed() said we can't
+ * inject to L1 now because L2 must run. Ask L2 to exit
+ * right after entry, so we can inject to L1 more promptly.
*/
+ kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
return;
+ }
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
- vmx_clear_hlt(vcpu);
}
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
- vmx_clear_hlt(vcpu);
}
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
- struct vmcs12 *vmcs12;
- if (to_vmx(vcpu)->nested.nested_run_pending)
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ if (to_vmx(vcpu)->nested.nested_run_pending ||
+ (vmcs12->idt_vectoring_info_field &
+ VECTORING_INFO_VALID_MASK))
return 0;
nested_vmx_vmexit(vcpu);
- vmcs12 = get_vmcs12(vcpu);
vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
vmcs12->vm_exit_intr_info = 0;
/* fall through to normal code, but now in L1, not L2 */
hypercall[2] = 0xc1;
}
-/* called to set cr0 as approriate for a mov-to-cr0 exit. */
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{
if (to_vmx(vcpu)->nested.vmxon &&
break;
}
vcpu->run->exit_reason = 0;
- pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
+ vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
(int)(exit_qualification >> 4) & 3, cr);
return 0;
}
return 1;
}
+static int handle_rdpmc(struct kvm_vcpu *vcpu)
+{
+ int err;
+
+ err = kvm_rdpmc(vcpu);
+ kvm_complete_insn_gp(vcpu, err);
+
+ return 1;
+}
+
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
bool has_error_code = false;
u32 error_code = 0;
u16 tss_selector;
- int reason, type, idt_v;
+ int reason, type, idt_v, idt_index;
idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+ idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
type != INTR_TYPE_NMI_INTR))
skip_emulated_instruction(vcpu);
- if (kvm_task_switch(vcpu, tss_selector, reason,
- has_error_code, error_code) == EMULATE_FAIL) {
+ if (kvm_task_switch(vcpu, tss_selector,
+ type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
+ has_error_code, error_code) == EMULATE_FAIL) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
{
unsigned long exit_qualification;
gpa_t gpa;
+ u32 error_code;
int gla_validity;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
+
+ /* It is a write fault? */
+ error_code = exit_qualification & (1U << 1);
+ /* ept page table is present? */
+ error_code |= (exit_qualification >> 3) & 0x1;
+
+ return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}
static u64 ept_rsvd_mask(u64 spte, int level)
int ret = 1;
u32 cpu_exec_ctrl;
bool intr_window_requested;
+ unsigned count = 130;
cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
- while (!guest_state_valid(vcpu)) {
- if (intr_window_requested
- && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
+ while (!guest_state_valid(vcpu) && count-- != 0) {
+ if (intr_window_requested && vmx_interrupt_allowed(vcpu))
return handle_interrupt_window(&vmx->vcpu);
+ if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
+ return 1;
+
err = emulate_instruction(vcpu, 0);
if (err == EMULATE_DO_MMIO) {
goto out;
}
- if (err != EMULATE_DONE)
+ if (err != EMULATE_DONE) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
return 0;
+ }
if (signal_pending(current))
goto out;
schedule();
}
- vmx->emulation_required = 0;
+ vmx->emulation_required = !guest_state_valid(vcpu);
out:
return ret;
}
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
+ [EXIT_REASON_RDPMC] = handle_rdpmc,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_VMCLEAR] = handle_vmclear,
[EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}
+static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
+{
+ int i, nr_msrs;
+ struct perf_guest_switch_msr *msrs;
+
+ msrs = perf_guest_get_msrs(&nr_msrs);
+
+ if (!msrs)
+ return;
+
+ for (i = 0; i < nr_msrs; i++)
+ if (msrs[i].host == msrs[i].guest)
+ clear_atomic_switch_msr(vmx, msrs[i].msr);
+ else
+ add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
+ msrs[i].host);
+}
+
#ifdef CONFIG_X86_64
#define R "r"
#define Q "q"
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long debugctlmsr;
if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
+ atomic_switch_perf_msrs(vmx);
+ debugctlmsr = get_debugctlmsr();
+
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
#endif
);
+ /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+ if (debugctlmsr)
+ update_debugctlmsr(debugctlmsr);
+
+#ifndef CONFIG_X86_64
+ /*
+ * The sysexit path does not restore ds/es, so we must set them to
+ * a reasonable value ourselves.
+ *
+ * We can't defer this to vmx_load_host_state() since that function
+ * may be executed in interrupt context, which saves and restore segments
+ * around it, nullifying its effect.
+ */
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#endif
+
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
| (1 << VCPU_EXREG_RFLAGS)
| (1 << VCPU_EXREG_CPL)
}
}
- asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
vmx->loaded_vmcs->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
return &vmx->vcpu;
free_vmcs:
- free_vmcs(vmx->loaded_vmcs->vmcs);
+ free_loaded_vmcs(vmx->loaded_vmcs);
free_msrs:
kfree(vmx->guest_msrs);
uninit_vcpu:
}
}
}
+
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ /* Exposing INVPCID only when PCID is exposed */
+ best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
+ if (vmx_invpcid_supported() &&
+ best && (best->ecx & bit(X86_FEATURE_INVPCID)) &&
+ guest_cpuid_has_pcid(vcpu)) {
+ exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ exec_control);
+ } else {
+ exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ exec_control);
+ if (best)
+ best->ecx &= ~bit(X86_FEATURE_INVPCID);
+ }
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
.cpuid_update = vmx_cpuid_update,
.rdtscp_supported = vmx_rdtscp_supported,
+ .invpcid_supported = vmx_invpcid_supported,
.set_supported_cpuid = vmx_set_supported_cpuid,
if (!vmx_io_bitmap_a)
return -ENOMEM;
+ r = -ENOMEM;
+
vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_b) {
- r = -ENOMEM;
+ if (!vmx_io_bitmap_b)
goto out;
- }
vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy) {
- r = -ENOMEM;
+ if (!vmx_msr_bitmap_legacy)
goto out1;
- }
+
vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode) {
- r = -ENOMEM;
+ if (!vmx_msr_bitmap_longmode)
goto out2;
- }
+
/*
* Allow direct access to the PC debug port (it is often used for I/O
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
if (enable_ept) {
- kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
- VMX_EPT_EXECUTABLE_MASK);
+ kvm_mmu_set_mask_ptes(0ull,
+ (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
+ (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
+ 0ull, VMX_EPT_EXECUTABLE_MASK);
ept_set_mmio_spte_mask();
kvm_enable_tdp();
} else