KVM: Xen PV-on-HVM guest support
[linux-2.6.git] / arch/x86/kvm/x86.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * derived from drivers/kvm/kvm_main.c
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright (C) 2008 Qumranet, Inc.
8  * Copyright IBM Corporation, 2008
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *   Amit Shah    <amit.shah@qumranet.com>
14  *   Ben-Ami Yassour <benami@il.ibm.com>
15  *
16  * This work is licensed under the terms of the GNU GPL, version 2.  See
17  * the COPYING file in the top-level directory.
18  *
19  */
20
21 #include <linux/kvm_host.h>
22 #include "irq.h"
23 #include "mmu.h"
24 #include "i8254.h"
25 #include "tss.h"
26 #include "kvm_cache_regs.h"
27 #include "x86.h"
28
29 #include <linux/clocksource.h>
30 #include <linux/interrupt.h>
31 #include <linux/kvm.h>
32 #include <linux/fs.h>
33 #include <linux/vmalloc.h>
34 #include <linux/module.h>
35 #include <linux/mman.h>
36 #include <linux/highmem.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/cpufreq.h>
40 #include <trace/events/kvm.h>
41 #undef TRACE_INCLUDE_FILE
42 #define CREATE_TRACE_POINTS
43 #include "trace.h"
44
45 #include <asm/uaccess.h>
46 #include <asm/msr.h>
47 #include <asm/desc.h>
48 #include <asm/mtrr.h>
49 #include <asm/mce.h>
50
51 #define MAX_IO_MSRS 256
52 #define CR0_RESERVED_BITS                                               \
53         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
54                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
55                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
56 #define CR4_RESERVED_BITS                                               \
57         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
58                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
59                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
60                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
61
62 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
63
64 #define KVM_MAX_MCE_BANKS 32
65 #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
66
67 /* EFER defaults:
68  * - enable syscall by default because it's emulated by KVM
69  * - enable LME and LMA by default on 64-bit KVM
70  */
71 #ifdef CONFIG_X86_64
72 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
73 #else
74 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
75 #endif
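/*
 * For reference: the reserved-bit masks above are the complement of the
 * writable EFER bits.  On 64-bit hosts ~0xfffffffffffffafeULL leaves
 * EFER.SCE (bit 0), EFER.LME (bit 8) and EFER.LMA (bit 10) writable; on
 * 32-bit hosts ~0xfffffffffffffffeULL leaves only EFER.SCE writable.  The
 * vendor modules are expected to widen the writable set with
 * kvm_enable_efer_bits() (defined below), e.g. for EFER.NX.
 */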
76
77 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
78 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
79
80 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
81 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
82                                     struct kvm_cpuid_entry2 __user *entries);
83
84 struct kvm_x86_ops *kvm_x86_ops;
85 EXPORT_SYMBOL_GPL(kvm_x86_ops);
86
87 int ignore_msrs = 0;
88 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
89
90 struct kvm_stats_debugfs_item debugfs_entries[] = {
91         { "pf_fixed", VCPU_STAT(pf_fixed) },
92         { "pf_guest", VCPU_STAT(pf_guest) },
93         { "tlb_flush", VCPU_STAT(tlb_flush) },
94         { "invlpg", VCPU_STAT(invlpg) },
95         { "exits", VCPU_STAT(exits) },
96         { "io_exits", VCPU_STAT(io_exits) },
97         { "mmio_exits", VCPU_STAT(mmio_exits) },
98         { "signal_exits", VCPU_STAT(signal_exits) },
99         { "irq_window", VCPU_STAT(irq_window_exits) },
100         { "nmi_window", VCPU_STAT(nmi_window_exits) },
101         { "halt_exits", VCPU_STAT(halt_exits) },
102         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
103         { "hypercalls", VCPU_STAT(hypercalls) },
104         { "request_irq", VCPU_STAT(request_irq_exits) },
105         { "irq_exits", VCPU_STAT(irq_exits) },
106         { "host_state_reload", VCPU_STAT(host_state_reload) },
107         { "efer_reload", VCPU_STAT(efer_reload) },
108         { "fpu_reload", VCPU_STAT(fpu_reload) },
109         { "insn_emulation", VCPU_STAT(insn_emulation) },
110         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
111         { "irq_injections", VCPU_STAT(irq_injections) },
112         { "nmi_injections", VCPU_STAT(nmi_injections) },
113         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
114         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
115         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
116         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
117         { "mmu_flooded", VM_STAT(mmu_flooded) },
118         { "mmu_recycled", VM_STAT(mmu_recycled) },
119         { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
120         { "mmu_unsync", VM_STAT(mmu_unsync) },
121         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
122         { "largepages", VM_STAT(lpages) },
123         { NULL }
124 };
125
126 unsigned long segment_base(u16 selector)
127 {
128         struct descriptor_table gdt;
129         struct desc_struct *d;
130         unsigned long table_base;
131         unsigned long v;
132
133         if (selector == 0)
134                 return 0;
135
136         kvm_get_gdt(&gdt);
137         table_base = gdt.base;
138
139         if (selector & 4) {           /* from ldt */
140                 u16 ldt_selector = kvm_read_ldt();
141
142                 table_base = segment_base(ldt_selector);
143         }
144         d = (struct desc_struct *)(table_base + (selector & ~7));
145         v = get_desc_base(d);
146 #ifdef CONFIG_X86_64
147         if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
148                 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
149 #endif
150         return v;
151 }
152 EXPORT_SYMBOL_GPL(segment_base);
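/*
 * For reference: an x86 segment selector is laid out as
 *
 *     bits 15..3   descriptor index
 *     bit  2       table indicator (0 = GDT, 1 = LDT)
 *     bits 1..0    requested privilege level (RPL)
 *
 * so "selector & 4" above tests the LDT bit, and "selector & ~7" turns the
 * selector into the byte offset of its 8-byte descriptor within the table.
 */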
153
154 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
155 {
156         if (irqchip_in_kernel(vcpu->kvm))
157                 return vcpu->arch.apic_base;
158         else
159                 return vcpu->arch.apic_base;
160 }
161 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
162
163 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
164 {
165         /* TODO: reserve bits check */
166         if (irqchip_in_kernel(vcpu->kvm))
167                 kvm_lapic_set_base(vcpu, data);
168         else
169                 vcpu->arch.apic_base = data;
170 }
171 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
172
173 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
174 {
175         WARN_ON(vcpu->arch.exception.pending);
176         vcpu->arch.exception.pending = true;
177         vcpu->arch.exception.has_error_code = false;
178         vcpu->arch.exception.nr = nr;
179 }
180 EXPORT_SYMBOL_GPL(kvm_queue_exception);
181
182 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
183                            u32 error_code)
184 {
185         ++vcpu->stat.pf_guest;
186
187         if (vcpu->arch.exception.pending) {
188                 switch(vcpu->arch.exception.nr) {
189                 case DF_VECTOR:
190                         /* triple fault -> shutdown */
191                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
192                         return;
193                 case PF_VECTOR:
194                         vcpu->arch.exception.nr = DF_VECTOR;
195                         vcpu->arch.exception.error_code = 0;
196                         return;
197                 default:
198                         /* replace previous exception with a new one in the
199                            hope that instruction re-execution will regenerate
200                            the lost exception */
201                         vcpu->arch.exception.pending = false;
202                         break;
203                 }
204         }
205         vcpu->arch.cr2 = addr;
206         kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
207 }
208
209 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
210 {
211         vcpu->arch.nmi_pending = 1;
212 }
213 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
214
215 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
216 {
217         WARN_ON(vcpu->arch.exception.pending);
218         vcpu->arch.exception.pending = true;
219         vcpu->arch.exception.has_error_code = true;
220         vcpu->arch.exception.nr = nr;
221         vcpu->arch.exception.error_code = error_code;
222 }
223 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
224
225 /*
226  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
227  * a #GP and return false.
228  */
229 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
230 {
231         if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
232                 return true;
233         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
234         return false;
235 }
236 EXPORT_SYMBOL_GPL(kvm_require_cpl);
237
238 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
239 {
240         unsigned long rflags;
241
242         rflags = kvm_x86_ops->get_rflags(vcpu);
243         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
244                 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
245         return rflags;
246 }
247 EXPORT_SYMBOL_GPL(kvm_get_rflags);
248
249 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
250 {
251         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
252                 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
253         kvm_x86_ops->set_rflags(vcpu, rflags);
254 }
255 EXPORT_SYMBOL_GPL(kvm_set_rflags);
256
257 /*
258  * Load the PAE pdptrs.  Return true if they are all valid.
259  */
260 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
261 {
262         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
263         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
264         int i;
265         int ret;
266         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
267
268         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
269                                   offset * sizeof(u64), sizeof(pdpte));
270         if (ret < 0) {
271                 ret = 0;
272                 goto out;
273         }
274         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
275                 if (is_present_gpte(pdpte[i]) &&
276                     (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
277                         ret = 0;
278                         goto out;
279                 }
280         }
281         ret = 1;
282
283         memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
284         __set_bit(VCPU_EXREG_PDPTR,
285                   (unsigned long *)&vcpu->arch.regs_avail);
286         __set_bit(VCPU_EXREG_PDPTR,
287                   (unsigned long *)&vcpu->arch.regs_dirty);
288 out:
289
290         return ret;
291 }
292 EXPORT_SYMBOL_GPL(load_pdptrs);
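/*
 * For reference: in PAE mode CR3 points at a 32-byte-aligned table of four
 * 8-byte PDPTEs that may sit anywhere inside a 4K page.  The arithmetic in
 * load_pdptrs() recovers that location:
 *
 *     pdpt_gfn = cr3 >> PAGE_SHIFT;                   frame holding the table
 *     offset   = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;   32-byte slot, in u64 units
 *
 * so "offset * sizeof(u64)" is the byte offset of PDPTE[0] within that frame.
 */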
293
294 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
295 {
296         u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
297         bool changed = true;
298         int r;
299
300         if (is_long_mode(vcpu) || !is_pae(vcpu))
301                 return false;
302
303         if (!test_bit(VCPU_EXREG_PDPTR,
304                       (unsigned long *)&vcpu->arch.regs_avail))
305                 return true;
306
307         r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
308         if (r < 0)
309                 goto out;
310         changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
311 out:
312
313         return changed;
314 }
315
316 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
317 {
318         if (cr0 & CR0_RESERVED_BITS) {
319                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
320                        cr0, vcpu->arch.cr0);
321                 kvm_inject_gp(vcpu, 0);
322                 return;
323         }
324
325         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
326                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
327                 kvm_inject_gp(vcpu, 0);
328                 return;
329         }
330
331         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
332                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
333                        "and a clear PE flag\n");
334                 kvm_inject_gp(vcpu, 0);
335                 return;
336         }
337
338         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
339 #ifdef CONFIG_X86_64
340                 if ((vcpu->arch.shadow_efer & EFER_LME)) {
341                         int cs_db, cs_l;
342
343                         if (!is_pae(vcpu)) {
344                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
345                                        "in long mode while PAE is disabled\n");
346                                 kvm_inject_gp(vcpu, 0);
347                                 return;
348                         }
349                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
350                         if (cs_l) {
351                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
352                                        "in long mode while CS.L == 1\n");
353                                 kvm_inject_gp(vcpu, 0);
354                                 return;
355
356                         }
357                 } else
358 #endif
359                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
360                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
361                                "reserved bits\n");
362                         kvm_inject_gp(vcpu, 0);
363                         return;
364                 }
365
366         }
367
368         kvm_x86_ops->set_cr0(vcpu, cr0);
369         vcpu->arch.cr0 = cr0;
370
371         kvm_mmu_reset_context(vcpu);
372         return;
373 }
374 EXPORT_SYMBOL_GPL(kvm_set_cr0);
375
376 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
377 {
378         kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
379 }
380 EXPORT_SYMBOL_GPL(kvm_lmsw);
381
382 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
383 {
384         unsigned long old_cr4 = vcpu->arch.cr4;
385         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
386
387         if (cr4 & CR4_RESERVED_BITS) {
388                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
389                 kvm_inject_gp(vcpu, 0);
390                 return;
391         }
392
393         if (is_long_mode(vcpu)) {
394                 if (!(cr4 & X86_CR4_PAE)) {
395                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
396                                "in long mode\n");
397                         kvm_inject_gp(vcpu, 0);
398                         return;
399                 }
400         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
401                    && ((cr4 ^ old_cr4) & pdptr_bits)
402                    && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
403                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
404                 kvm_inject_gp(vcpu, 0);
405                 return;
406         }
407
408         if (cr4 & X86_CR4_VMXE) {
409                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
410                 kvm_inject_gp(vcpu, 0);
411                 return;
412         }
413         kvm_x86_ops->set_cr4(vcpu, cr4);
414         vcpu->arch.cr4 = cr4;
415         vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
416         kvm_mmu_reset_context(vcpu);
417 }
418 EXPORT_SYMBOL_GPL(kvm_set_cr4);
419
420 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
421 {
422         if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
423                 kvm_mmu_sync_roots(vcpu);
424                 kvm_mmu_flush_tlb(vcpu);
425                 return;
426         }
427
428         if (is_long_mode(vcpu)) {
429                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
430                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
431                         kvm_inject_gp(vcpu, 0);
432                         return;
433                 }
434         } else {
435                 if (is_pae(vcpu)) {
436                         if (cr3 & CR3_PAE_RESERVED_BITS) {
437                                 printk(KERN_DEBUG
438                                        "set_cr3: #GP, reserved bits\n");
439                                 kvm_inject_gp(vcpu, 0);
440                                 return;
441                         }
442                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
443                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
444                                        "reserved bits\n");
445                                 kvm_inject_gp(vcpu, 0);
446                                 return;
447                         }
448                 }
449                 /*
450                  * We don't check reserved bits in nonpae mode, because
451                  * this isn't enforced, and VMware depends on this.
452                  */
453         }
454
455         /*
456          * Does the new cr3 value map to physical memory? (Note, we
457          * catch an invalid cr3 even in real-mode, because it would
458          * cause trouble later on when we turn on paging anyway.)
459          *
460          * A real CPU would silently accept an invalid cr3 and would
461          * attempt to use it - with largely undefined (and often hard
462          * to debug) behavior on the guest side.
463          */
464         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
465                 kvm_inject_gp(vcpu, 0);
466         else {
467                 vcpu->arch.cr3 = cr3;
468                 vcpu->arch.mmu.new_cr3(vcpu);
469         }
470 }
471 EXPORT_SYMBOL_GPL(kvm_set_cr3);
472
473 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
474 {
475         if (cr8 & CR8_RESERVED_BITS) {
476                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
477                 kvm_inject_gp(vcpu, 0);
478                 return;
479         }
480         if (irqchip_in_kernel(vcpu->kvm))
481                 kvm_lapic_set_tpr(vcpu, cr8);
482         else
483                 vcpu->arch.cr8 = cr8;
484 }
485 EXPORT_SYMBOL_GPL(kvm_set_cr8);
486
487 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
488 {
489         if (irqchip_in_kernel(vcpu->kvm))
490                 return kvm_lapic_get_cr8(vcpu);
491         else
492                 return vcpu->arch.cr8;
493 }
494 EXPORT_SYMBOL_GPL(kvm_get_cr8);
495
496 static inline u32 bit(int bitno)
497 {
498         return 1 << (bitno & 31);
499 }
500
501 /*
502  * List of MSR numbers which we expose to userspace through KVM_GET_MSRS,
503  * KVM_SET_MSRS and KVM_GET_MSR_INDEX_LIST.
504  *
505  * This list is modified at module load time to reflect the
506  * capabilities of the host cpu. The capability test skips MSRs that are
507  * kvm-specific; those are placed at the beginning of the list.
508  */
509
510 #define KVM_SAVE_MSRS_BEGIN     2
511 static u32 msrs_to_save[] = {
512         MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
513         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
514         MSR_K6_STAR,
515 #ifdef CONFIG_X86_64
516         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
517 #endif
518         MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
519 };
520
521 static unsigned num_msrs_to_save;
522
523 static u32 emulated_msrs[] = {
524         MSR_IA32_MISC_ENABLE,
525 };
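/*
 * For reference: KVM_SAVE_MSRS_BEGIN counts the KVM-specific entries at the
 * head of msrs_to_save[] (MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK).  The
 * host-capability filtering done at module load skips these first entries,
 * since they are purely paravirtual and can never be read from the host CPU.
 * emulated_msrs[] lists MSRs that are always emulated and are therefore
 * reported to userspace unconditionally.
 */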
526
527 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
528 {
529         if (efer & efer_reserved_bits) {
530                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
531                        efer);
532                 kvm_inject_gp(vcpu, 0);
533                 return;
534         }
535
536         if (is_paging(vcpu)
537             && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
538                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
539                 kvm_inject_gp(vcpu, 0);
540                 return;
541         }
542
543         if (efer & EFER_FFXSR) {
544                 struct kvm_cpuid_entry2 *feat;
545
546                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
547                 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
548                         printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
549                         kvm_inject_gp(vcpu, 0);
550                         return;
551                 }
552         }
553
554         if (efer & EFER_SVME) {
555                 struct kvm_cpuid_entry2 *feat;
556
557                 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
558                 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
559                         printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
560                         kvm_inject_gp(vcpu, 0);
561                         return;
562                 }
563         }
564
565         kvm_x86_ops->set_efer(vcpu, efer);
566
567         efer &= ~EFER_LMA;
568         efer |= vcpu->arch.shadow_efer & EFER_LMA;
569
570         vcpu->arch.shadow_efer = efer;
571
572         vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
573         kvm_mmu_reset_context(vcpu);
574 }
575
576 void kvm_enable_efer_bits(u64 mask)
577 {
578        efer_reserved_bits &= ~mask;
579 }
580 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
581
582
583 /*
584  * Writes msr value into the appropriate "register".
585  * Returns 0 on success, non-0 otherwise.
586  * Assumes vcpu_load() was already called.
587  */
588 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
589 {
590         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
591 }
592
593 /*
594  * Adapt set_msr() to msr_io()'s calling convention
595  */
596 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
597 {
598         return kvm_set_msr(vcpu, index, *data);
599 }
600
601 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
602 {
603         static int version;
604         struct pvclock_wall_clock wc;
605         struct timespec now, sys, boot;
606
607         if (!wall_clock)
608                 return;
609
610         version++;
611
612         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
613
614         /*
615          * The guest calculates current wall clock time by adding
616          * system time (updated by kvm_write_guest_time below) to the
617  * wall clock specified here.  Guest system time equals host
618          * system time for us, thus we must fill in host boot time here.
619          */
620         now = current_kernel_time();
621         ktime_get_ts(&sys);
622         boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
623
624         wc.sec = boot.tv_sec;
625         wc.nsec = boot.tv_nsec;
626         wc.version = version;
627
628         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
629
630         version++;
631         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
632 }
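/*
 * A minimal sketch (hypothetical guest-side code, for illustration only) of
 * how the odd/even version protocol above is meant to be consumed, given a
 * guest mapping "shared" of the pvclock_wall_clock area: retry until the same
 * even version is seen before and after the copy.
 *
 *     struct pvclock_wall_clock wc;
 *     u32 v1, v2;
 *
 *     do {
 *             v1 = shared->version;
 *             rmb();
 *             wc = *shared;
 *             rmb();
 *             v2 = shared->version;
 *     } while ((v1 & 1) || v1 != v2);
 *
 * wc.sec/wc.nsec then hold the host boot-time snapshot written above.
 */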
633
634 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
635 {
636         uint32_t quotient, remainder;
637
638         /* Don't try to replace with do_div(); this one calculates
639          * "(dividend << 32) / divisor" */
640         __asm__ ( "divl %4"
641                   : "=a" (quotient), "=d" (remainder)
642                   : "0" (0), "1" (dividend), "r" (divisor) );
643         return quotient;
644 }
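/*
 * Equivalent C for the inline asm above (illustration only; div_frac_c is a
 * hypothetical helper, the asm is kept because a 64-by-32 divide whose
 * quotient fits in 32 bits maps directly onto a single divl):
 *
 *     static uint32_t div_frac_c(uint32_t dividend, uint32_t divisor)
 *     {
 *             return (uint32_t)(((uint64_t)dividend << 32) / divisor);
 *     }
 *
 * Note that divl faults if the quotient overflows 32 bits; the caller below
 * guarantees dividend < divisor, so the result is a 0.32 fixed-point fraction.
 */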
645
646 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
647 {
648         uint64_t nsecs = 1000000000LL;
649         int32_t  shift = 0;
650         uint64_t tps64;
651         uint32_t tps32;
652
653         tps64 = tsc_khz * 1000LL;
654         while (tps64 > nsecs*2) {
655                 tps64 >>= 1;
656                 shift--;
657         }
658
659         tps32 = (uint32_t)tps64;
660         while (tps32 <= (uint32_t)nsecs) {
661                 tps32 <<= 1;
662                 shift++;
663         }
664
665         hv_clock->tsc_shift = shift;
666         hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
667
668         pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
669                  __func__, tsc_khz, hv_clock->tsc_shift,
670                  hv_clock->tsc_to_system_mul);
671 }
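/*
 * For reference: tsc_shift/tsc_to_system_mul are chosen so that the guest can
 * convert a TSC delta into nanoseconds with a fixed-point multiply:
 *
 *     delta = tsc - tsc_timestamp;
 *     if (tsc_shift >= 0)
 *             delta <<= tsc_shift;
 *     else
 *             delta >>= -tsc_shift;
 *     nsec  = (delta * tsc_to_system_mul) >> 32;
 *
 * i.e. tsc_to_system_mul is a 0.32 fixed-point nanoseconds-per-scaled-tick
 * value, and the two loops above keep the multiplier in range for any tsc_khz.
 */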
672
673 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
674
675 static void kvm_write_guest_time(struct kvm_vcpu *v)
676 {
677         struct timespec ts;
678         unsigned long flags;
679         struct kvm_vcpu_arch *vcpu = &v->arch;
680         void *shared_kaddr;
681         unsigned long this_tsc_khz;
682
683         if ((!vcpu->time_page))
684                 return;
685
686         this_tsc_khz = get_cpu_var(cpu_tsc_khz);
687         if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
688                 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
689                 vcpu->hv_clock_tsc_khz = this_tsc_khz;
690         }
691         put_cpu_var(cpu_tsc_khz);
692
693         /* Keep irq disabled to prevent changes to the clock */
694         local_irq_save(flags);
695         kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
696         ktime_get_ts(&ts);
697         local_irq_restore(flags);
698
699         /* With all the info we got, fill in the values */
700
701         vcpu->hv_clock.system_time = ts.tv_nsec +
702                                      (NSEC_PER_SEC * (u64)ts.tv_sec);
703         /*
704          * The interface expects us to write an even number signaling that the
705          * update is finished. Since the guest won't see the intermediate
706          * state, we just increase by 2 at the end.
707          */
708         vcpu->hv_clock.version += 2;
709
710         shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
711
712         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
713                sizeof(vcpu->hv_clock));
714
715         kunmap_atomic(shared_kaddr, KM_USER0);
716
717         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
718 }
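/*
 * For reference: with the structure written above, the guest derives its
 * current time as
 *
 *     now_ns = hv_clock.system_time
 *              + scale(rdtsc() - hv_clock.tsc_timestamp);
 *
 * where "scale" stands for the shift/multiply conversion sketched after
 * kvm_set_time_scale() (the name is illustrative), retrying whenever
 * hv_clock.version is odd or changes across the read.
 */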
719
720 static int kvm_request_guest_time_update(struct kvm_vcpu *v)
721 {
722         struct kvm_vcpu_arch *vcpu = &v->arch;
723
724         if (!vcpu->time_page)
725                 return 0;
726         set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
727         return 1;
728 }
729
730 static bool msr_mtrr_valid(unsigned msr)
731 {
732         switch (msr) {
733         case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
734         case MSR_MTRRfix64K_00000:
735         case MSR_MTRRfix16K_80000:
736         case MSR_MTRRfix16K_A0000:
737         case MSR_MTRRfix4K_C0000:
738         case MSR_MTRRfix4K_C8000:
739         case MSR_MTRRfix4K_D0000:
740         case MSR_MTRRfix4K_D8000:
741         case MSR_MTRRfix4K_E0000:
742         case MSR_MTRRfix4K_E8000:
743         case MSR_MTRRfix4K_F0000:
744         case MSR_MTRRfix4K_F8000:
745         case MSR_MTRRdefType:
746         case MSR_IA32_CR_PAT:
747                 return true;
748         case 0x2f8:
749                 return true;
750         }
751         return false;
752 }
753
754 static bool valid_pat_type(unsigned t)
755 {
756         return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
757 }
758
759 static bool valid_mtrr_type(unsigned t)
760 {
761         return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
762 }
763
764 static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
765 {
766         int i;
767
768         if (!msr_mtrr_valid(msr))
769                 return false;
770
771         if (msr == MSR_IA32_CR_PAT) {
772                 for (i = 0; i < 8; i++)
773                         if (!valid_pat_type((data >> (i * 8)) & 0xff))
774                                 return false;
775                 return true;
776         } else if (msr == MSR_MTRRdefType) {
777                 if (data & ~0xcff)
778                         return false;
779                 return valid_mtrr_type(data & 0xff);
780         } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
781                 for (i = 0; i < 8 ; i++)
782                         if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
783                                 return false;
784                 return true;
785         }
786
787         /* variable MTRRs */
788         return valid_mtrr_type(data & 0xff);
789 }
790
791 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
792 {
793         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
794
795         if (!mtrr_valid(vcpu, msr, data))
796                 return 1;
797
798         if (msr == MSR_MTRRdefType) {
799                 vcpu->arch.mtrr_state.def_type = data;
800                 vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
801         } else if (msr == MSR_MTRRfix64K_00000)
802                 p[0] = data;
803         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
804                 p[1 + msr - MSR_MTRRfix16K_80000] = data;
805         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
806                 p[3 + msr - MSR_MTRRfix4K_C0000] = data;
807         else if (msr == MSR_IA32_CR_PAT)
808                 vcpu->arch.pat = data;
809         else {  /* Variable MTRRs */
810                 int idx, is_mtrr_mask;
811                 u64 *pt;
812
813                 idx = (msr - 0x200) / 2;
814                 is_mtrr_mask = msr - 0x200 - 2 * idx;
815                 if (!is_mtrr_mask)
816                         pt =
817                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
818                 else
819                         pt =
820                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
821                 *pt = data;
822         }
823
824         kvm_mmu_reset_context(vcpu);
825         return 0;
826 }
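/*
 * For reference: the "variable MTRRs" branch relies on the architectural MSR
 * layout, where the range registers come in base/mask pairs starting at 0x200:
 *
 *     0x200 MTRRphysBase0, 0x201 MTRRphysMask0,
 *     0x202 MTRRphysBase1, 0x203 MTRRphysMask1, ...
 *
 * hence idx = (msr - 0x200) / 2 selects the range, and the low bit of the
 * difference says whether the base or the mask half is being accessed.  The
 * same decoding is used by get_msr_mtrr() below.
 */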
827
828 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
829 {
830         u64 mcg_cap = vcpu->arch.mcg_cap;
831         unsigned bank_num = mcg_cap & 0xff;
832
833         switch (msr) {
834         case MSR_IA32_MCG_STATUS:
835                 vcpu->arch.mcg_status = data;
836                 break;
837         case MSR_IA32_MCG_CTL:
838                 if (!(mcg_cap & MCG_CTL_P))
839                         return 1;
840                 if (data != 0 && data != ~(u64)0)
841                         return -1;
842                 vcpu->arch.mcg_ctl = data;
843                 break;
844         default:
845                 if (msr >= MSR_IA32_MC0_CTL &&
846                     msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
847                         u32 offset = msr - MSR_IA32_MC0_CTL;
848                         /* only 0 or all 1s can be written to IA32_MCi_CTL */
849                         if ((offset & 0x3) == 0 &&
850                             data != 0 && data != ~(u64)0)
851                                 return -1;
852                         vcpu->arch.mce_banks[offset] = data;
853                         break;
854                 }
855                 return 1;
856         }
857         return 0;
858 }
859
860 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
861 {
862         struct kvm *kvm = vcpu->kvm;
863         int lm = is_long_mode(vcpu);
864         u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
865                 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
866         u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
867                 : kvm->arch.xen_hvm_config.blob_size_32;
868         u32 page_num = data & ~PAGE_MASK;
869         u64 page_addr = data & PAGE_MASK;
870         u8 *page;
871         int r;
872
873         r = -E2BIG;
874         if (page_num >= blob_size)
875                 goto out;
876         r = -ENOMEM;
877         page = kzalloc(PAGE_SIZE, GFP_KERNEL);
878         if (!page)
879                 goto out;
880         r = -EFAULT;
881         if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
882                 goto out_free;
883         if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
884                 goto out_free;
885         r = 0;
886 out_free:
887         kfree(page);
888 out:
889         return r;
890 }
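/*
 * For reference: this implements the Xen PV-on-HVM hypercall-page protocol.
 * Userspace registers the hypercall blob address/size per guest mode
 * (kvm->arch.xen_hvm_config, filled in from the KVM_XEN_HVM_CONFIG ioctl that
 * accompanies this support), and the guest then writes to the configured MSR
 * a value encoding
 *
 *     bits 63..12  guest-physical address of the destination page
 *     bits 11..0   index of the blob page to copy there
 *
 * which is why page_num/page_addr above are split with PAGE_MASK.
 */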
891
892 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
893 {
894         switch (msr) {
895         case MSR_EFER:
896                 set_efer(vcpu, data);
897                 break;
898         case MSR_K7_HWCR:
899                 data &= ~(u64)0x40;     /* ignore flush filter disable */
900                 if (data != 0) {
901                         pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
902                                 data);
903                         return 1;
904                 }
905                 break;
906         case MSR_FAM10H_MMIO_CONF_BASE:
907                 if (data != 0) {
908                         pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
909                                 "0x%llx\n", data);
910                         return 1;
911                 }
912                 break;
913         case MSR_AMD64_NB_CFG:
914                 break;
915         case MSR_IA32_DEBUGCTLMSR:
916                 if (!data) {
917                         /* We support the non-activated case already */
918                         break;
919                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
920                         /* Values other than LBR and BTF are vendor-specific,
921                            thus reserved and should throw a #GP */
922                         return 1;
923                 }
924                 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
925                         __func__, data);
926                 break;
927         case MSR_IA32_UCODE_REV:
928         case MSR_IA32_UCODE_WRITE:
929         case MSR_VM_HSAVE_PA:
930         case MSR_AMD64_PATCH_LOADER:
931                 break;
932         case 0x200 ... 0x2ff:
933                 return set_msr_mtrr(vcpu, msr, data);
934         case MSR_IA32_APICBASE:
935                 kvm_set_apic_base(vcpu, data);
936                 break;
937         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
938                 return kvm_x2apic_msr_write(vcpu, msr, data);
939         case MSR_IA32_MISC_ENABLE:
940                 vcpu->arch.ia32_misc_enable_msr = data;
941                 break;
942         case MSR_KVM_WALL_CLOCK:
943                 vcpu->kvm->arch.wall_clock = data;
944                 kvm_write_wall_clock(vcpu->kvm, data);
945                 break;
946         case MSR_KVM_SYSTEM_TIME: {
947                 if (vcpu->arch.time_page) {
948                         kvm_release_page_dirty(vcpu->arch.time_page);
949                         vcpu->arch.time_page = NULL;
950                 }
951
952                 vcpu->arch.time = data;
953
954                 /* we verify if the enable bit is set... */
955                 if (!(data & 1))
956                         break;
957
958                 /* ...but clean it before doing the actual write */
959                 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
960
961                 vcpu->arch.time_page =
962                                 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
963
964                 if (is_error_page(vcpu->arch.time_page)) {
965                         kvm_release_page_clean(vcpu->arch.time_page);
966                         vcpu->arch.time_page = NULL;
967                 }
968
969                 kvm_request_guest_time_update(vcpu);
970                 break;
971         }
972         case MSR_IA32_MCG_CTL:
973         case MSR_IA32_MCG_STATUS:
974         case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
975                 return set_msr_mce(vcpu, msr, data);
976
977         /* Performance counters are not protected by a CPUID bit,
978          * so we should check all of them in the generic path for the sake of
979          * cross-vendor migration.
980          * Writing a zero into the event select MSRs disables them,
981          * which we perfectly emulate ;-). Any other value should at least be
982          * reported; some guests depend on them.
983          */
984         case MSR_P6_EVNTSEL0:
985         case MSR_P6_EVNTSEL1:
986         case MSR_K7_EVNTSEL0:
987         case MSR_K7_EVNTSEL1:
988         case MSR_K7_EVNTSEL2:
989         case MSR_K7_EVNTSEL3:
990                 if (data != 0)
991                         pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
992                                 "0x%x data 0x%llx\n", msr, data);
993                 break;
994         /* At least RHEL 4 unconditionally writes to the perfctr registers,
995          * so we ignore writes to make it happy.
996          */
997         case MSR_P6_PERFCTR0:
998         case MSR_P6_PERFCTR1:
999         case MSR_K7_PERFCTR0:
1000         case MSR_K7_PERFCTR1:
1001         case MSR_K7_PERFCTR2:
1002         case MSR_K7_PERFCTR3:
1003                 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1004                         "0x%x data 0x%llx\n", msr, data);
1005                 break;
1006         default:
1007                 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1008                         return xen_hvm_config(vcpu, data);
1009                 if (!ignore_msrs) {
1010                         pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
1011                                 msr, data);
1012                         return 1;
1013                 } else {
1014                         pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
1015                                 msr, data);
1016                         break;
1017                 }
1018         }
1019         return 0;
1020 }
1021 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1022
1023
1024 /*
1025  * Reads an msr value (of 'msr_index') into 'pdata'.
1026  * Returns 0 on success, non-0 otherwise.
1027  * Assumes vcpu_load() was already called.
1028  */
1029 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1030 {
1031         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1032 }
1033
1034 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1035 {
1036         u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1037
1038         if (!msr_mtrr_valid(msr))
1039                 return 1;
1040
1041         if (msr == MSR_MTRRdefType)
1042                 *pdata = vcpu->arch.mtrr_state.def_type +
1043                          (vcpu->arch.mtrr_state.enabled << 10);
1044         else if (msr == MSR_MTRRfix64K_00000)
1045                 *pdata = p[0];
1046         else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1047                 *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
1048         else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1049                 *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
1050         else if (msr == MSR_IA32_CR_PAT)
1051                 *pdata = vcpu->arch.pat;
1052         else {  /* Variable MTRRs */
1053                 int idx, is_mtrr_mask;
1054                 u64 *pt;
1055
1056                 idx = (msr - 0x200) / 2;
1057                 is_mtrr_mask = msr - 0x200 - 2 * idx;
1058                 if (!is_mtrr_mask)
1059                         pt =
1060                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1061                 else
1062                         pt =
1063                           (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1064                 *pdata = *pt;
1065         }
1066
1067         return 0;
1068 }
1069
1070 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1071 {
1072         u64 data;
1073         u64 mcg_cap = vcpu->arch.mcg_cap;
1074         unsigned bank_num = mcg_cap & 0xff;
1075
1076         switch (msr) {
1077         case MSR_IA32_P5_MC_ADDR:
1078         case MSR_IA32_P5_MC_TYPE:
1079                 data = 0;
1080                 break;
1081         case MSR_IA32_MCG_CAP:
1082                 data = vcpu->arch.mcg_cap;
1083                 break;
1084         case MSR_IA32_MCG_CTL:
1085                 if (!(mcg_cap & MCG_CTL_P))
1086                         return 1;
1087                 data = vcpu->arch.mcg_ctl;
1088                 break;
1089         case MSR_IA32_MCG_STATUS:
1090                 data = vcpu->arch.mcg_status;
1091                 break;
1092         default:
1093                 if (msr >= MSR_IA32_MC0_CTL &&
1094                     msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1095                         u32 offset = msr - MSR_IA32_MC0_CTL;
1096                         data = vcpu->arch.mce_banks[offset];
1097                         break;
1098                 }
1099                 return 1;
1100         }
1101         *pdata = data;
1102         return 0;
1103 }
1104
1105 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1106 {
1107         u64 data;
1108
1109         switch (msr) {
1110         case MSR_IA32_PLATFORM_ID:
1111         case MSR_IA32_UCODE_REV:
1112         case MSR_IA32_EBL_CR_POWERON:
1113         case MSR_IA32_DEBUGCTLMSR:
1114         case MSR_IA32_LASTBRANCHFROMIP:
1115         case MSR_IA32_LASTBRANCHTOIP:
1116         case MSR_IA32_LASTINTFROMIP:
1117         case MSR_IA32_LASTINTTOIP:
1118         case MSR_K8_SYSCFG:
1119         case MSR_K7_HWCR:
1120         case MSR_VM_HSAVE_PA:
1121         case MSR_P6_PERFCTR0:
1122         case MSR_P6_PERFCTR1:
1123         case MSR_P6_EVNTSEL0:
1124         case MSR_P6_EVNTSEL1:
1125         case MSR_K7_EVNTSEL0:
1126         case MSR_K7_PERFCTR0:
1127         case MSR_K8_INT_PENDING_MSG:
1128         case MSR_AMD64_NB_CFG:
1129         case MSR_FAM10H_MMIO_CONF_BASE:
1130                 data = 0;
1131                 break;
1132         case MSR_MTRRcap:
1133                 data = 0x500 | KVM_NR_VAR_MTRR;
1134                 break;
1135         case 0x200 ... 0x2ff:
1136                 return get_msr_mtrr(vcpu, msr, pdata);
1137         case 0xcd: /* fsb frequency */
1138                 data = 3;
1139                 break;
1140         case MSR_IA32_APICBASE:
1141                 data = kvm_get_apic_base(vcpu);
1142                 break;
1143         case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1144                 return kvm_x2apic_msr_read(vcpu, msr, pdata);
1145                 break;
1146         case MSR_IA32_MISC_ENABLE:
1147                 data = vcpu->arch.ia32_misc_enable_msr;
1148                 break;
1149         case MSR_IA32_PERF_STATUS:
1150                 /* TSC increment by tick */
1151                 data = 1000ULL;
1152                 /* CPU multiplier */
1153                 data |= (((uint64_t)4ULL) << 40);
1154                 break;
1155         case MSR_EFER:
1156                 data = vcpu->arch.shadow_efer;
1157                 break;
1158         case MSR_KVM_WALL_CLOCK:
1159                 data = vcpu->kvm->arch.wall_clock;
1160                 break;
1161         case MSR_KVM_SYSTEM_TIME:
1162                 data = vcpu->arch.time;
1163                 break;
1164         case MSR_IA32_P5_MC_ADDR:
1165         case MSR_IA32_P5_MC_TYPE:
1166         case MSR_IA32_MCG_CAP:
1167         case MSR_IA32_MCG_CTL:
1168         case MSR_IA32_MCG_STATUS:
1169         case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1170                 return get_msr_mce(vcpu, msr, pdata);
1171         default:
1172                 if (!ignore_msrs) {
1173                         pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1174                         return 1;
1175                 } else {
1176                         pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
1177                         data = 0;
1178                 }
1179                 break;
1180         }
1181         *pdata = data;
1182         return 0;
1183 }
1184 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1185
1186 /*
1187  * Read or write a bunch of msrs. All parameters are kernel addresses.
1188  *
1189  * @return number of msrs set successfully.
1190  */
1191 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1192                     struct kvm_msr_entry *entries,
1193                     int (*do_msr)(struct kvm_vcpu *vcpu,
1194                                   unsigned index, u64 *data))
1195 {
1196         int i;
1197
1198         vcpu_load(vcpu);
1199
1200         down_read(&vcpu->kvm->slots_lock);
1201         for (i = 0; i < msrs->nmsrs; ++i)
1202                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1203                         break;
1204         up_read(&vcpu->kvm->slots_lock);
1205
1206         vcpu_put(vcpu);
1207
1208         return i;
1209 }
1210
1211 /*
1212  * Read or write a bunch of msrs. Parameters are user addresses.
1213  *
1214  * @return number of msrs set successfully.
1215  */
1216 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
1217                   int (*do_msr)(struct kvm_vcpu *vcpu,
1218                                 unsigned index, u64 *data),
1219                   int writeback)
1220 {
1221         struct kvm_msrs msrs;
1222         struct kvm_msr_entry *entries;
1223         int r, n;
1224         unsigned size;
1225
1226         r = -EFAULT;
1227         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1228                 goto out;
1229
1230         r = -E2BIG;
1231         if (msrs.nmsrs >= MAX_IO_MSRS)
1232                 goto out;
1233
1234         r = -ENOMEM;
1235         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1236         entries = vmalloc(size);
1237         if (!entries)
1238                 goto out;
1239
1240         r = -EFAULT;
1241         if (copy_from_user(entries, user_msrs->entries, size))
1242                 goto out_free;
1243
1244         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
1245         if (r < 0)
1246                 goto out_free;
1247
1248         r = -EFAULT;
1249         if (writeback && copy_to_user(user_msrs->entries, entries, size))
1250                 goto out_free;
1251
1252         r = n;
1253
1254 out_free:
1255         vfree(entries);
1256 out:
1257         return r;
1258 }
1259
1260 int kvm_dev_ioctl_check_extension(long ext)
1261 {
1262         int r;
1263
1264         switch (ext) {
1265         case KVM_CAP_IRQCHIP:
1266         case KVM_CAP_HLT:
1267         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1268         case KVM_CAP_SET_TSS_ADDR:
1269         case KVM_CAP_EXT_CPUID:
1270         case KVM_CAP_CLOCKSOURCE:
1271         case KVM_CAP_PIT:
1272         case KVM_CAP_NOP_IO_DELAY:
1273         case KVM_CAP_MP_STATE:
1274         case KVM_CAP_SYNC_MMU:
1275         case KVM_CAP_REINJECT_CONTROL:
1276         case KVM_CAP_IRQ_INJECT_STATUS:
1277         case KVM_CAP_ASSIGN_DEV_IRQ:
1278         case KVM_CAP_IRQFD:
1279         case KVM_CAP_IOEVENTFD:
1280         case KVM_CAP_PIT2:
1281         case KVM_CAP_PIT_STATE2:
1282         case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1283         case KVM_CAP_XEN_HVM:
1284                 r = 1;
1285                 break;
1286         case KVM_CAP_COALESCED_MMIO:
1287                 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
1288                 break;
1289         case KVM_CAP_VAPIC:
1290                 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
1291                 break;
1292         case KVM_CAP_NR_VCPUS:
1293                 r = KVM_MAX_VCPUS;
1294                 break;
1295         case KVM_CAP_NR_MEMSLOTS:
1296                 r = KVM_MEMORY_SLOTS;
1297                 break;
1298         case KVM_CAP_PV_MMU:    /* obsolete */
1299                 r = 0;
1300                 break;
1301         case KVM_CAP_IOMMU:
1302                 r = iommu_found();
1303                 break;
1304         case KVM_CAP_MCE:
1305                 r = KVM_MAX_MCE_BANKS;
1306                 break;
1307         default:
1308                 r = 0;
1309                 break;
1310         }
1311         return r;
1312
1313 }
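/*
 * A minimal userspace sketch (illustration only) of how these capability
 * values are consumed; for the x86 capabilities handled here the ioctl return
 * value is exactly the "r" computed above:
 *
 *     int kvm = open("/dev/kvm", O_RDWR);
 *     int has_xen = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);
 *
 * has_xen > 0 indicates the Xen PV-on-HVM configuration added by this change
 * is available; capabilities such as KVM_CAP_NR_VCPUS return a count rather
 * than a boolean.
 */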
1314
1315 long kvm_arch_dev_ioctl(struct file *filp,
1316                         unsigned int ioctl, unsigned long arg)
1317 {
1318         void __user *argp = (void __user *)arg;
1319         long r;
1320
1321         switch (ioctl) {
1322         case KVM_GET_MSR_INDEX_LIST: {
1323                 struct kvm_msr_list __user *user_msr_list = argp;
1324                 struct kvm_msr_list msr_list;
1325                 unsigned n;
1326
1327                 r = -EFAULT;
1328                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1329                         goto out;
1330                 n = msr_list.nmsrs;
1331                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1332                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1333                         goto out;
1334                 r = -E2BIG;
1335                 if (n < msr_list.nmsrs)
1336                         goto out;
1337                 r = -EFAULT;
1338                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1339                                  num_msrs_to_save * sizeof(u32)))
1340                         goto out;
1341                 if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
1342                                  &emulated_msrs,
1343                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1344                         goto out;
1345                 r = 0;
1346                 break;
1347         }
1348         case KVM_GET_SUPPORTED_CPUID: {
1349                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1350                 struct kvm_cpuid2 cpuid;
1351
1352                 r = -EFAULT;
1353                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1354                         goto out;
1355                 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
1356                                                       cpuid_arg->entries);
1357                 if (r)
1358                         goto out;
1359
1360                 r = -EFAULT;
1361                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1362                         goto out;
1363                 r = 0;
1364                 break;
1365         }
1366         case KVM_X86_GET_MCE_CAP_SUPPORTED: {
1367                 u64 mce_cap;
1368
1369                 mce_cap = KVM_MCE_CAP_SUPPORTED;
1370                 r = -EFAULT;
1371                 if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
1372                         goto out;
1373                 r = 0;
1374                 break;
1375         }
1376         default:
1377                 r = -EINVAL;
1378         }
1379 out:
1380         return r;
1381 }
1382
1383 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1384 {
1385         kvm_x86_ops->vcpu_load(vcpu, cpu);
1386         if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1387                 unsigned long khz = cpufreq_quick_get(cpu);
1388                 if (!khz)
1389                         khz = tsc_khz;
1390                 per_cpu(cpu_tsc_khz, cpu) = khz;
1391         }
1392         kvm_request_guest_time_update(vcpu);
1393 }
1394
1395 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1396 {
1397         kvm_x86_ops->vcpu_put(vcpu);
1398         kvm_put_guest_fpu(vcpu);
1399 }
1400
1401 static int is_efer_nx(void)
1402 {
1403         unsigned long long efer = 0;
1404
1405         rdmsrl_safe(MSR_EFER, &efer);
1406         return efer & EFER_NX;
1407 }
1408
1409 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1410 {
1411         int i;
1412         struct kvm_cpuid_entry2 *e, *entry;
1413
1414         entry = NULL;
1415         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1416                 e = &vcpu->arch.cpuid_entries[i];
1417                 if (e->function == 0x80000001) {
1418                         entry = e;
1419                         break;
1420                 }
1421         }
1422         if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
1423                 entry->edx &= ~(1 << 20);
1424                 printk(KERN_INFO "kvm: guest NX capability removed\n");
1425         }
1426 }
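/*
 * For reference: bit 20 of CPUID leaf 0x80000001 EDX is the NX (execute
 * disable) feature flag, so the fixup above simply hides NX from the guest
 * when the host kernel runs with EFER.NX clear and KVM therefore cannot
 * provide no-execute pages.
 */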
1427
1428 /* Legacy KVM_SET_CPUID path: an old userspace fills in plain kvm_cpuid_entry
1428  * data (no index/flags) for a newer kernel module */
1429 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1430                                     struct kvm_cpuid *cpuid,
1431                                     struct kvm_cpuid_entry __user *entries)
1432 {
1433         int r, i;
1434         struct kvm_cpuid_entry *cpuid_entries;
1435
1436         r = -E2BIG;
1437         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1438                 goto out;
1439         r = -ENOMEM;
1440         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1441         if (!cpuid_entries)
1442                 goto out;
1443         r = -EFAULT;
1444         if (copy_from_user(cpuid_entries, entries,
1445                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1446                 goto out_free;
1447         for (i = 0; i < cpuid->nent; i++) {
1448                 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1449                 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1450                 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1451                 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1452                 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1453                 vcpu->arch.cpuid_entries[i].index = 0;
1454                 vcpu->arch.cpuid_entries[i].flags = 0;
1455                 vcpu->arch.cpuid_entries[i].padding[0] = 0;
1456                 vcpu->arch.cpuid_entries[i].padding[1] = 0;
1457                 vcpu->arch.cpuid_entries[i].padding[2] = 0;
1458         }
1459         vcpu->arch.cpuid_nent = cpuid->nent;
1460         cpuid_fix_nx_cap(vcpu);
1461         r = 0;
1462         kvm_apic_set_version(vcpu);
1463
1464 out_free:
1465         vfree(cpuid_entries);
1466 out:
1467         return r;
1468 }
1469
1470 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1471                                      struct kvm_cpuid2 *cpuid,
1472                                      struct kvm_cpuid_entry2 __user *entries)
1473 {
1474         int r;
1475
1476         r = -E2BIG;
1477         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1478                 goto out;
1479         r = -EFAULT;
1480         if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1481                            cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1482                 goto out;
1483         vcpu->arch.cpuid_nent = cpuid->nent;
1484         kvm_apic_set_version(vcpu);
1485         return 0;
1486
1487 out:
1488         return r;
1489 }
1490
1491 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1492                                      struct kvm_cpuid2 *cpuid,
1493                                      struct kvm_cpuid_entry2 __user *entries)
1494 {
1495         int r;
1496
1497         r = -E2BIG;
1498         if (cpuid->nent < vcpu->arch.cpuid_nent)
1499                 goto out;
1500         r = -EFAULT;
1501         if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1502                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1503                 goto out;
1504         return 0;
1505
1506 out:
1507         cpuid->nent = vcpu->arch.cpuid_nent;
1508         return r;
1509 }
1510
1511 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1512                            u32 index)
1513 {
1514         entry->function = function;
1515         entry->index = index;
1516         cpuid_count(entry->function, entry->index,
1517                     &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1518         entry->flags = 0;
1519 }
1520
1521 #define F(x) bit(X86_FEATURE_##x)
1522
1523 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1524                          u32 index, int *nent, int maxnent)
1525 {
1526         unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1527         unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
1528 #ifdef CONFIG_X86_64
1529         unsigned f_lm = F(LM);
1530 #else
1531         unsigned f_lm = 0;
1532 #endif
1533
1534         /* cpuid 1.edx */
1535         const u32 kvm_supported_word0_x86_features =
1536                 F(FPU) | F(VME) | F(DE) | F(PSE) |
1537                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1538                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
1539                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1540                 F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
1541                 0 /* Reserved, DS, ACPI */ | F(MMX) |
1542                 F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
1543                 0 /* HTT, TM, Reserved, PBE */;
1544         /* cpuid 0x80000001.edx */
1545         const u32 kvm_supported_word1_x86_features =
1546                 F(FPU) | F(VME) | F(DE) | F(PSE) |
1547                 F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1548                 F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
1549                 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1550                 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1551                 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1552                 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
1553                 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1554         /* cpuid 1.ecx */
1555         const u32 kvm_supported_word4_x86_features =
1556                 F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
1557                 0 /* DS-CPL, VMX, SMX, EST */ |
1558                 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1559                 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1560                 0 /* Reserved, DCA */ | F(XMM4_1) |
1561                 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1562                 0 /* Reserved, XSAVE, OSXSAVE */;
1563         /* cpuid 0x80000001.ecx */
1564         const u32 kvm_supported_word6_x86_features =
1565                 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1566                 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1567                 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
1568                 0 /* SKINIT */ | 0 /* WDT */;
1569
1570         /* all calls to cpuid_count() should be made on the same cpu */
1571         get_cpu();
1572         do_cpuid_1_ent(entry, function, index);
1573         ++*nent;
1574
1575         switch (function) {
1576         case 0:
1577                 entry->eax = min(entry->eax, (u32)0xb);
1578                 break;
1579         case 1:
1580                 entry->edx &= kvm_supported_word0_x86_features;
1581                 entry->ecx &= kvm_supported_word4_x86_features;
1582                 /* we support x2apic emulation even if the host does not
1583                  * support it, since we emulate x2apic in software */
1584                 entry->ecx |= F(X2APIC);
1585                 break;
1586         /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1587          * may return different values. This forces us to get_cpu() before
1588          * issuing the first command, and also to emulate this annoying behavior
1589          * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1590         case 2: {
1591                 int t, times = entry->eax & 0xff;
1592
1593                 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1594                 entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1595                 for (t = 1; t < times && *nent < maxnent; ++t) {
1596                         do_cpuid_1_ent(&entry[t], function, 0);
1597                         entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1598                         ++*nent;
1599                 }
1600                 break;
1601         }
1602         /* functions 4 and 0xb have an additional index. */
1603         case 4: {
1604                 int i, cache_type;
1605
1606                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1607                 /* read more entries until cache_type is zero */
1608                 for (i = 1; *nent < maxnent; ++i) {
1609                         cache_type = entry[i - 1].eax & 0x1f;
1610                         if (!cache_type)
1611                                 break;
1612                         do_cpuid_1_ent(&entry[i], function, i);
1613                         entry[i].flags |=
1614                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1615                         ++*nent;
1616                 }
1617                 break;
1618         }
1619         case 0xb: {
1620                 int i, level_type;
1621
1622                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1623                 /* read more entries until level_type is zero */
1624                 for (i = 1; *nent < maxnent; ++i) {
1625                         level_type = entry[i - 1].ecx & 0xff00;
1626                         if (!level_type)
1627                                 break;
1628                         do_cpuid_1_ent(&entry[i], function, i);
1629                         entry[i].flags |=
1630                                KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1631                         ++*nent;
1632                 }
1633                 break;
1634         }
1635         case 0x80000000:
1636                 entry->eax = min(entry->eax, 0x8000001a);
1637                 break;
1638         case 0x80000001:
1639                 entry->edx &= kvm_supported_word1_x86_features;
1640                 entry->ecx &= kvm_supported_word6_x86_features;
1641                 break;
1642         }
1643         put_cpu();
1644 }
1645
1646 #undef F
1647
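/*
 * KVM_GET_SUPPORTED_CPUID: enumerate the standard (0x0...) and extended
 * (0x80000000...) leaves through do_cpuid_ent() and copy at most
 * cpuid->nent entries back to user space; -E2BIG means the supplied
 * buffer was too small.
 */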
1648 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1649                                      struct kvm_cpuid_entry2 __user *entries)
1650 {
1651         struct kvm_cpuid_entry2 *cpuid_entries;
1652         int limit, nent = 0, r = -E2BIG;
1653         u32 func;
1654
1655         if (cpuid->nent < 1)
1656                 goto out;
1657         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1658                 cpuid->nent = KVM_MAX_CPUID_ENTRIES;
1659         r = -ENOMEM;
1660         cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1661         if (!cpuid_entries)
1662                 goto out;
1663
1664         do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1665         limit = cpuid_entries[0].eax;
1666         for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1667                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1668                              &nent, cpuid->nent);
1669         r = -E2BIG;
1670         if (nent >= cpuid->nent)
1671                 goto out_free;
1672
1673         do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1674         limit = cpuid_entries[nent - 1].eax;
1675         for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1676                 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1677                              &nent, cpuid->nent);
1678         r = -E2BIG;
1679         if (nent >= cpuid->nent)
1680                 goto out_free;
1681
1682         r = -EFAULT;
1683         if (copy_to_user(entries, cpuid_entries,
1684                          nent * sizeof(struct kvm_cpuid_entry2)))
1685                 goto out_free;
1686         cpuid->nent = nent;
1687         r = 0;
1688
1689 out_free:
1690         vfree(cpuid_entries);
1691 out:
1692         return r;
1693 }
1694
1695 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1696                                     struct kvm_lapic_state *s)
1697 {
1698         vcpu_load(vcpu);
1699         memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1700         vcpu_put(vcpu);
1701
1702         return 0;
1703 }
1704
1705 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1706                                     struct kvm_lapic_state *s)
1707 {
1708         vcpu_load(vcpu);
1709         memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1710         kvm_apic_post_state_restore(vcpu);
1711         update_cr8_intercept(vcpu);
1712         vcpu_put(vcpu);
1713
1714         return 0;
1715 }
1716
1717 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1718                                     struct kvm_interrupt *irq)
1719 {
1720         if (irq->irq < 0 || irq->irq >= 256)
1721                 return -EINVAL;
1722         if (irqchip_in_kernel(vcpu->kvm))
1723                 return -ENXIO;
1724         vcpu_load(vcpu);
1725
1726         kvm_queue_interrupt(vcpu, irq->irq, false);
1727
1728         vcpu_put(vcpu);
1729
1730         return 0;
1731 }
1732
1733 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
1734 {
1735         vcpu_load(vcpu);
1736         kvm_inject_nmi(vcpu);
1737         vcpu_put(vcpu);
1738
1739         return 0;
1740 }
1741
1742 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1743                                            struct kvm_tpr_access_ctl *tac)
1744 {
1745         if (tac->flags)
1746                 return -EINVAL;
1747         vcpu->arch.tpr_access_reporting = !!tac->enabled;
1748         return 0;
1749 }
1750
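/*
 * KVM_X86_SETUP_MCE: validate the requested MCG_CAP (bank count and
 * capability bits) and start out with MCG_CTL and every MCi_CTL set to
 * all 1s, i.e. all error reporting enabled.
 */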
1751 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
1752                                         u64 mcg_cap)
1753 {
1754         int r;
1755         unsigned bank_num = mcg_cap & 0xff, bank;
1756
1757         r = -EINVAL;
1758         if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
1759                 goto out;
1760         if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
1761                 goto out;
1762         r = 0;
1763         vcpu->arch.mcg_cap = mcg_cap;
1764         /* Init IA32_MCG_CTL to all 1s */
1765         if (mcg_cap & MCG_CTL_P)
1766                 vcpu->arch.mcg_ctl = ~(u64)0;
1767         /* Init IA32_MCi_CTL to all 1s */
1768         for (bank = 0; bank < bank_num; bank++)
1769                 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
1770 out:
1771         return r;
1772 }
1773
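/*
 * KVM_X86_SET_MCE: inject a machine check supplied by user space.
 * Uncorrected errors are dropped while MCG_CTL/MCi_CTL are not fully
 * enabled, and an uncorrected error while MCIP is already set (or with
 * CR4.MCE clear) escalates to a triple fault.
 */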
1774 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1775                                       struct kvm_x86_mce *mce)
1776 {
1777         u64 mcg_cap = vcpu->arch.mcg_cap;
1778         unsigned bank_num = mcg_cap & 0xff;
1779         u64 *banks = vcpu->arch.mce_banks;
1780
1781         if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
1782                 return -EINVAL;
1783         /*
1784          * if IA32_MCG_CTL is not all 1s, the uncorrected error
1785          * reporting is disabled
1786          */
1787         if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
1788             vcpu->arch.mcg_ctl != ~(u64)0)
1789                 return 0;
1790         banks += 4 * mce->bank;
1791         /*
1792          * if IA32_MCi_CTL is not all 1s, the uncorrected error
1793          * reporting is disabled for the bank
1794          */
1795         if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
1796                 return 0;
1797         if (mce->status & MCI_STATUS_UC) {
1798                 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
1799                     !(vcpu->arch.cr4 & X86_CR4_MCE)) {
1800                         printk(KERN_DEBUG "kvm: set_mce: "
1801                                "injects mce exception while "
1802                                "previous one is in progress!\n");
1803                         set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
1804                         return 0;
1805                 }
1806                 if (banks[1] & MCI_STATUS_VAL)
1807                         mce->status |= MCI_STATUS_OVER;
1808                 banks[2] = mce->addr;
1809                 banks[3] = mce->misc;
1810                 vcpu->arch.mcg_status = mce->mcg_status;
1811                 banks[1] = mce->status;
1812                 kvm_queue_exception(vcpu, MC_VECTOR);
1813         } else if (!(banks[1] & MCI_STATUS_VAL)
1814                    || !(banks[1] & MCI_STATUS_UC)) {
1815                 if (banks[1] & MCI_STATUS_VAL)
1816                         mce->status |= MCI_STATUS_OVER;
1817                 banks[2] = mce->addr;
1818                 banks[3] = mce->misc;
1819                 banks[1] = mce->status;
1820         } else
1821                 banks[1] |= MCI_STATUS_OVER;
1822         return 0;
1823 }
1824
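/*
 * x86 side of the per-vcpu ioctls: local APIC state, CPUID tables,
 * interrupt and NMI injection, MSR access, TPR access reporting, the
 * vAPIC page address and MCE setup/injection.
 */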
1825 long kvm_arch_vcpu_ioctl(struct file *filp,
1826                          unsigned int ioctl, unsigned long arg)
1827 {
1828         struct kvm_vcpu *vcpu = filp->private_data;
1829         void __user *argp = (void __user *)arg;
1830         int r;
1831         struct kvm_lapic_state *lapic = NULL;
1832
1833         switch (ioctl) {
1834         case KVM_GET_LAPIC: {
1835                 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1836
1837                 r = -ENOMEM;
1838                 if (!lapic)
1839                         goto out;
1840                 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1841                 if (r)
1842                         goto out;
1843                 r = -EFAULT;
1844                 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1845                         goto out;
1846                 r = 0;
1847                 break;
1848         }
1849         case KVM_SET_LAPIC: {
1850                 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1851                 r = -ENOMEM;
1852                 if (!lapic)
1853                         goto out;
1854                 r = -EFAULT;
1855                 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1856                         goto out;
1857                 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1858                 if (r)
1859                         goto out;
1860                 r = 0;
1861                 break;
1862         }
1863         case KVM_INTERRUPT: {
1864                 struct kvm_interrupt irq;
1865
1866                 r = -EFAULT;
1867                 if (copy_from_user(&irq, argp, sizeof irq))
1868                         goto out;
1869                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1870                 if (r)
1871                         goto out;
1872                 r = 0;
1873                 break;
1874         }
1875         case KVM_NMI: {
1876                 r = kvm_vcpu_ioctl_nmi(vcpu);
1877                 if (r)
1878                         goto out;
1879                 r = 0;
1880                 break;
1881         }
1882         case KVM_SET_CPUID: {
1883                 struct kvm_cpuid __user *cpuid_arg = argp;
1884                 struct kvm_cpuid cpuid;
1885
1886                 r = -EFAULT;
1887                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1888                         goto out;
1889                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1890                 if (r)
1891                         goto out;
1892                 break;
1893         }
1894         case KVM_SET_CPUID2: {
1895                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1896                 struct kvm_cpuid2 cpuid;
1897
1898                 r = -EFAULT;
1899                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1900                         goto out;
1901                 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1902                                               cpuid_arg->entries);
1903                 if (r)
1904                         goto out;
1905                 break;
1906         }
1907         case KVM_GET_CPUID2: {
1908                 struct kvm_cpuid2 __user *cpuid_arg = argp;
1909                 struct kvm_cpuid2 cpuid;
1910
1911                 r = -EFAULT;
1912                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1913                         goto out;
1914                 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1915                                               cpuid_arg->entries);
1916                 if (r)
1917                         goto out;
1918                 r = -EFAULT;
1919                 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1920                         goto out;
1921                 r = 0;
1922                 break;
1923         }
1924         case KVM_GET_MSRS:
1925                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1926                 break;
1927         case KVM_SET_MSRS:
1928                 r = msr_io(vcpu, argp, do_set_msr, 0);
1929                 break;
1930         case KVM_TPR_ACCESS_REPORTING: {
1931                 struct kvm_tpr_access_ctl tac;
1932
1933                 r = -EFAULT;
1934                 if (copy_from_user(&tac, argp, sizeof tac))
1935                         goto out;
1936                 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1937                 if (r)
1938                         goto out;
1939                 r = -EFAULT;
1940                 if (copy_to_user(argp, &tac, sizeof tac))
1941                         goto out;
1942                 r = 0;
1943                 break;
1944         }
1945         case KVM_SET_VAPIC_ADDR: {
1946                 struct kvm_vapic_addr va;
1947
1948                 r = -EINVAL;
1949                 if (!irqchip_in_kernel(vcpu->kvm))
1950                         goto out;
1951                 r = -EFAULT;
1952                 if (copy_from_user(&va, argp, sizeof va))
1953                         goto out;
1954                 r = 0;
1955                 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1956                 break;
1957         }
1958         case KVM_X86_SETUP_MCE: {
1959                 u64 mcg_cap;
1960
1961                 r = -EFAULT;
1962                 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
1963                         goto out;
1964                 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
1965                 break;
1966         }
1967         case KVM_X86_SET_MCE: {
1968                 struct kvm_x86_mce mce;
1969
1970                 r = -EFAULT;
1971                 if (copy_from_user(&mce, argp, sizeof mce))
1972                         goto out;
1973                 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
1974                 break;
1975         }
1976         default:
1977                 r = -EINVAL;
1978         }
1979 out:
1980         kfree(lapic);
1981         return r;
1982 }
1983
1984 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1985 {
1986         int ret;
1987
1988         if (addr > (unsigned int)(-3 * PAGE_SIZE))
1989                 return -1;
1990         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1991         return ret;
1992 }
1993
1994 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
1995                                               u64 ident_addr)
1996 {
1997         kvm->arch.ept_identity_map_addr = ident_addr;
1998         return 0;
1999 }
2000
2001 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
2002                                           u32 kvm_nr_mmu_pages)
2003 {
2004         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
2005                 return -EINVAL;
2006
2007         down_write(&kvm->slots_lock);
2008         spin_lock(&kvm->mmu_lock);
2009
2010         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
2011         kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
2012
2013         spin_unlock(&kvm->mmu_lock);
2014         up_write(&kvm->slots_lock);
2015         return 0;
2016 }
2017
2018 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2019 {
2020         return kvm->arch.n_alloc_mmu_pages;
2021 }
2022
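/*
 * Translate a guest frame number through the alias table: a gfn that
 * falls inside an alias window is redirected to the corresponding
 * target region, anything else is returned unchanged.
 */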
2023 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2024 {
2025         int i;
2026         struct kvm_mem_alias *alias;
2027
2028         for (i = 0; i < kvm->arch.naliases; ++i) {
2029                 alias = &kvm->arch.aliases[i];
2030                 if (gfn >= alias->base_gfn
2031                     && gfn < alias->base_gfn + alias->npages)
2032                         return alias->target_gfn + gfn - alias->base_gfn;
2033         }
2034         return gfn;
2035 }
2036
2037 /*
2038  * Set a new alias region.  Aliases map a portion of physical memory into
2039  * another portion.  This is useful for memory windows, for example the PC
2040  * VGA region.
2041  */
2042 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
2043                                          struct kvm_memory_alias *alias)
2044 {
2045         int r, n;
2046         struct kvm_mem_alias *p;
2047
2048         r = -EINVAL;
2049         /* General sanity checks */
2050         if (alias->memory_size & (PAGE_SIZE - 1))
2051                 goto out;
2052         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
2053                 goto out;
2054         if (alias->slot >= KVM_ALIAS_SLOTS)
2055                 goto out;
2056         if (alias->guest_phys_addr + alias->memory_size
2057             < alias->guest_phys_addr)
2058                 goto out;
2059         if (alias->target_phys_addr + alias->memory_size
2060             < alias->target_phys_addr)
2061                 goto out;
2062
2063         down_write(&kvm->slots_lock);
2064         spin_lock(&kvm->mmu_lock);
2065
2066         p = &kvm->arch.aliases[alias->slot];
2067         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2068         p->npages = alias->memory_size >> PAGE_SHIFT;
2069         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2070
2071         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2072                 if (kvm->arch.aliases[n - 1].npages)
2073                         break;
2074         kvm->arch.naliases = n;
2075
2076         spin_unlock(&kvm->mmu_lock);
2077         kvm_mmu_zap_all(kvm);
2078
2079         up_write(&kvm->slots_lock);
2080
2081         return 0;
2082
2083 out:
2084         return r;
2085 }
2086
2087 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2088 {
2089         int r;
2090
2091         r = 0;
2092         switch (chip->chip_id) {
2093         case KVM_IRQCHIP_PIC_MASTER:
2094                 memcpy(&chip->chip.pic,
2095                         &pic_irqchip(kvm)->pics[0],
2096                         sizeof(struct kvm_pic_state));
2097                 break;
2098         case KVM_IRQCHIP_PIC_SLAVE:
2099                 memcpy(&chip->chip.pic,
2100                         &pic_irqchip(kvm)->pics[1],
2101                         sizeof(struct kvm_pic_state));
2102                 break;
2103         case KVM_IRQCHIP_IOAPIC:
2104                 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
2105                 break;
2106         default:
2107                 r = -EINVAL;
2108                 break;
2109         }
2110         return r;
2111 }
2112
2113 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2114 {
2115         int r;
2116
2117         r = 0;
2118         switch (chip->chip_id) {
2119         case KVM_IRQCHIP_PIC_MASTER:
2120                 spin_lock(&pic_irqchip(kvm)->lock);
2121                 memcpy(&pic_irqchip(kvm)->pics[0],
2122                         &chip->chip.pic,
2123                         sizeof(struct kvm_pic_state));
2124                 spin_unlock(&pic_irqchip(kvm)->lock);
2125                 break;
2126         case KVM_IRQCHIP_PIC_SLAVE:
2127                 spin_lock(&pic_irqchip(kvm)->lock);
2128                 memcpy(&pic_irqchip(kvm)->pics[1],
2129                         &chip->chip.pic,
2130                         sizeof(struct kvm_pic_state));
2131                 spin_unlock(&pic_irqchip(kvm)->lock);
2132                 break;
2133         case KVM_IRQCHIP_IOAPIC:
2134                 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
2135                 break;
2136         default:
2137                 r = -EINVAL;
2138                 break;
2139         }
2140         kvm_pic_update_irq(pic_irqchip(kvm));
2141         return r;
2142 }
2143
2144 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
2145 {
2146         int r = 0;
2147
2148         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2149         memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
2150         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2151         return r;
2152 }
2153
2154 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
2155 {
2156         int r = 0;
2157
2158         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2159         memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
2160         kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
2161         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2162         return r;
2163 }
2164
2165 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2166 {
2167         int r = 0;
2168
2169         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2170         memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
2171                 sizeof(ps->channels));
2172         ps->flags = kvm->arch.vpit->pit_state.flags;
2173         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2174         return r;
2175 }
2176
2177 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2178 {
2179         int r = 0, start = 0;
2180         u32 prev_legacy, cur_legacy;
2181         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2182         prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
2183         cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
2184         if (!prev_legacy && cur_legacy)
2185                 start = 1;
2186         memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
2187                sizeof(kvm->arch.vpit->pit_state.channels));
2188         kvm->arch.vpit->pit_state.flags = ps->flags;
2189         kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
2190         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2191         return r;
2192 }
2193
2194 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
2195                                  struct kvm_reinject_control *control)
2196 {
2197         if (!kvm->arch.vpit)
2198                 return -ENXIO;
2199         mutex_lock(&kvm->arch.vpit->pit_state.lock);
2200         kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
2201         mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2202         return 0;
2203 }
2204
2205 /*
2206  * Get (and clear) the dirty memory log for a memory slot.
2207  */
2208 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2209                                       struct kvm_dirty_log *log)
2210 {
2211         int r;
2212         int n;
2213         struct kvm_memory_slot *memslot;
2214         int is_dirty = 0;
2215
2216         down_write(&kvm->slots_lock);
2217
2218         r = kvm_get_dirty_log(kvm, log, &is_dirty);
2219         if (r)
2220                 goto out;
2221
2222         /* If nothing is dirty, don't bother messing with page tables. */
2223         if (is_dirty) {
2224                 spin_lock(&kvm->mmu_lock);
2225                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2226                 spin_unlock(&kvm->mmu_lock);
2227                 memslot = &kvm->memslots[log->slot];
2228                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
2229                 memset(memslot->dirty_bitmap, 0, n);
2230         }
2231         r = 0;
2232 out:
2233         up_write(&kvm->slots_lock);
2234         return r;
2235 }
2236
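/*
 * x86 side of the VM-wide ioctls: TSS and EPT identity-map addresses,
 * memory regions and aliases, the in-kernel irqchip and PIT, IRQ line
 * injection, PIT reinjection control and the Xen HVM configuration.
 */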
2237 long kvm_arch_vm_ioctl(struct file *filp,
2238                        unsigned int ioctl, unsigned long arg)
2239 {
2240         struct kvm *kvm = filp->private_data;
2241         void __user *argp = (void __user *)arg;
2242         int r = -ENOTTY;
2243         /*
2244          * This union makes it completely explicit to gcc-3.x
2245          * that these variables' stack usage should be
2246          * combined, not added together.
2247          */
2248         union {
2249                 struct kvm_pit_state ps;
2250                 struct kvm_pit_state2 ps2;
2251                 struct kvm_memory_alias alias;
2252                 struct kvm_pit_config pit_config;
2253         } u;
2254
2255         switch (ioctl) {
2256         case KVM_SET_TSS_ADDR:
2257                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
2258                 if (r < 0)
2259                         goto out;
2260                 break;
2261         case KVM_SET_IDENTITY_MAP_ADDR: {
2262                 u64 ident_addr;
2263
2264                 r = -EFAULT;
2265                 if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
2266                         goto out;
2267                 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
2268                 if (r < 0)
2269                         goto out;
2270                 break;
2271         }
2272         case KVM_SET_MEMORY_REGION: {
2273                 struct kvm_memory_region kvm_mem;
2274                 struct kvm_userspace_memory_region kvm_userspace_mem;
2275
2276                 r = -EFAULT;
2277                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2278                         goto out;
2279                 kvm_userspace_mem.slot = kvm_mem.slot;
2280                 kvm_userspace_mem.flags = kvm_mem.flags;
2281                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
2282                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
2283                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
2284                 if (r)
2285                         goto out;
2286                 break;
2287         }
2288         case KVM_SET_NR_MMU_PAGES:
2289                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
2290                 if (r)
2291                         goto out;
2292                 break;
2293         case KVM_GET_NR_MMU_PAGES:
2294                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
2295                 break;
2296         case KVM_SET_MEMORY_ALIAS:
2297                 r = -EFAULT;
2298                 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
2299                         goto out;
2300                 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
2301                 if (r)
2302                         goto out;
2303                 break;
2304         case KVM_CREATE_IRQCHIP:
2305                 r = -ENOMEM;
2306                 kvm->arch.vpic = kvm_create_pic(kvm);
2307                 if (kvm->arch.vpic) {
2308                         r = kvm_ioapic_init(kvm);
2309                         if (r) {
2310                                 kfree(kvm->arch.vpic);
2311                                 kvm->arch.vpic = NULL;
2312                                 goto out;
2313                         }
2314                 } else
2315                         goto out;
2316                 r = kvm_setup_default_irq_routing(kvm);
2317                 if (r) {
2318                         kfree(kvm->arch.vpic);
2319                         kfree(kvm->arch.vioapic);
2320                         goto out;
2321                 }
2322                 break;
2323         case KVM_CREATE_PIT:
2324                 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2325                 goto create_pit;
2326         case KVM_CREATE_PIT2:
2327                 r = -EFAULT;
2328                 if (copy_from_user(&u.pit_config, argp,
2329                                    sizeof(struct kvm_pit_config)))
2330                         goto out;
2331         create_pit:
2332                 down_write(&kvm->slots_lock);
2333                 r = -EEXIST;
2334                 if (kvm->arch.vpit)
2335                         goto create_pit_unlock;
2336                 r = -ENOMEM;
2337                 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
2338                 if (kvm->arch.vpit)
2339                         r = 0;
2340         create_pit_unlock:
2341                 up_write(&kvm->slots_lock);
2342                 break;
2343         case KVM_IRQ_LINE_STATUS:
2344         case KVM_IRQ_LINE: {
2345                 struct kvm_irq_level irq_event;
2346
2347                 r = -EFAULT;
2348                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2349                         goto out;
2350                 if (irqchip_in_kernel(kvm)) {
2351                         __s32 status;
2352                         status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2353                                         irq_event.irq, irq_event.level);
2354                         if (ioctl == KVM_IRQ_LINE_STATUS) {
2355                                 irq_event.status = status;
2356                                 if (copy_to_user(argp, &irq_event,
2357                                                         sizeof irq_event))
2358                                         goto out;
2359                         }
2360                         r = 0;
2361                 }
2362                 break;
2363         }
2364         case KVM_GET_IRQCHIP: {
2365                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
2366                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
2367
2368                 r = -ENOMEM;
2369                 if (!chip)
2370                         goto out;
2371                 r = -EFAULT;
2372                 if (copy_from_user(chip, argp, sizeof *chip))
2373                         goto get_irqchip_out;
2374                 r = -ENXIO;
2375                 if (!irqchip_in_kernel(kvm))
2376                         goto get_irqchip_out;
2377                 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
2378                 if (r)
2379                         goto get_irqchip_out;
2380                 r = -EFAULT;
2381                 if (copy_to_user(argp, chip, sizeof *chip))
2382                         goto get_irqchip_out;
2383                 r = 0;
2384         get_irqchip_out:
2385                 kfree(chip);
2386                 if (r)
2387                         goto out;
2388                 break;
2389         }
2390         case KVM_SET_IRQCHIP: {
2391                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
2392                 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
2393
2394                 r = -ENOMEM;
2395                 if (!chip)
2396                         goto out;
2397                 r = -EFAULT;
2398                 if (copy_from_user(chip, argp, sizeof *chip))
2399                         goto set_irqchip_out;
2400                 r = -ENXIO;
2401                 if (!irqchip_in_kernel(kvm))
2402                         goto set_irqchip_out;
2403                 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
2404                 if (r)
2405                         goto set_irqchip_out;
2406                 r = 0;
2407         set_irqchip_out:
2408                 kfree(chip);
2409                 if (r)
2410                         goto out;
2411                 break;
2412         }
2413         case KVM_GET_PIT: {
2414                 r = -EFAULT;
2415                 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
2416                         goto out;
2417                 r = -ENXIO;
2418                 if (!kvm->arch.vpit)
2419                         goto out;
2420                 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
2421                 if (r)
2422                         goto out;
2423                 r = -EFAULT;
2424                 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
2425                         goto out;
2426                 r = 0;
2427                 break;
2428         }
2429         case KVM_SET_PIT: {
2430                 r = -EFAULT;
2431                 if (copy_from_user(&u.ps, argp, sizeof u.ps))
2432                         goto out;
2433                 r = -ENXIO;
2434                 if (!kvm->arch.vpit)
2435                         goto out;
2436                 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
2437                 if (r)
2438                         goto out;
2439                 r = 0;
2440                 break;
2441         }
2442         case KVM_GET_PIT2: {
2443                 r = -ENXIO;
2444                 if (!kvm->arch.vpit)
2445                         goto out;
2446                 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
2447                 if (r)
2448                         goto out;
2449                 r = -EFAULT;
2450                 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
2451                         goto out;
2452                 r = 0;
2453                 break;
2454         }
2455         case KVM_SET_PIT2: {
2456                 r = -EFAULT;
2457                 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
2458                         goto out;
2459                 r = -ENXIO;
2460                 if (!kvm->arch.vpit)
2461                         goto out;
2462                 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
2463                 if (r)
2464                         goto out;
2465                 r = 0;
2466                 break;
2467         }
2468         case KVM_REINJECT_CONTROL: {
2469                 struct kvm_reinject_control control;
2470                 r =  -EFAULT;
2471                 if (copy_from_user(&control, argp, sizeof(control)))
2472                         goto out;
2473                 r = kvm_vm_ioctl_reinject(kvm, &control);
2474                 if (r)
2475                         goto out;
2476                 r = 0;
2477                 break;
2478         }
2479         case KVM_XEN_HVM_CONFIG: {
2480                 r = -EFAULT;
2481                 if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
2482                                    sizeof(struct kvm_xen_hvm_config)))
2483                         goto out;
2484                 r = -EINVAL;
2485                 if (kvm->arch.xen_hvm_config.flags)
2486                         goto out;
2487                 r = 0;
2488                 break;
2489         }
2490         default:
2491                 ;
2492         }
2493 out:
2494         return r;
2495 }
2496
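/*
 * Probe each MSR in msrs_to_save with rdmsr_safe() and compact the list
 * so that only MSRs actually present on this host remain; the leading
 * KVM-specific MSRs are kept unconditionally.
 */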
2497 static void kvm_init_msr_list(void)
2498 {
2499         u32 dummy[2];
2500         unsigned i, j;
2501
2502         /* skip the first MSRs in the list; they are KVM-specific */
2503         for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
2504                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2505                         continue;
2506                 if (j < i)
2507                         msrs_to_save[j] = msrs_to_save[i];
2508                 j++;
2509         }
2510         num_msrs_to_save = j;
2511 }
2512
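/*
 * MMIO helpers: give the in-kernel local APIC first shot at the access
 * and fall back to the rest of the mmio bus.
 */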
2513 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
2514                            const void *v)
2515 {
2516         if (vcpu->arch.apic &&
2517             !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
2518                 return 0;
2519
2520         return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
2521 }
2522
2523 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
2524 {
2525         if (vcpu->arch.apic &&
2526             !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
2527                 return 0;
2528
2529         return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
2530 }
2531
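/*
 * Copy data out of a guest virtual address range, translating page by
 * page through the active MMU; an unmapped gva yields
 * X86EMUL_PROPAGATE_FAULT.  kvm_write_guest_virt() below is the mirror
 * image for stores.
 */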
2532 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
2533                                struct kvm_vcpu *vcpu)
2534 {
2535         void *data = val;
2536         int r = X86EMUL_CONTINUE;
2537
2538         while (bytes) {
2539                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2540                 unsigned offset = addr & (PAGE_SIZE-1);
2541                 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2542                 int ret;
2543
2544                 if (gpa == UNMAPPED_GVA) {
2545                         r = X86EMUL_PROPAGATE_FAULT;
2546                         goto out;
2547                 }
2548                 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
2549                 if (ret < 0) {
2550                         r = X86EMUL_UNHANDLEABLE;
2551                         goto out;
2552                 }
2553
2554                 bytes -= toread;
2555                 data += toread;
2556                 addr += toread;
2557         }
2558 out:
2559         return r;
2560 }
2561
2562 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2563                                 struct kvm_vcpu *vcpu)
2564 {
2565         void *data = val;
2566         int r = X86EMUL_CONTINUE;
2567
2568         while (bytes) {
2569                 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2570                 unsigned offset = addr & (PAGE_SIZE-1);
2571                 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2572                 int ret;
2573
2574                 if (gpa == UNMAPPED_GVA) {
2575                         r = X86EMUL_PROPAGATE_FAULT;
2576                         goto out;
2577                 }
2578                 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
2579                 if (ret < 0) {
2580                         r = X86EMUL_UNHANDLEABLE;
2581                         goto out;
2582                 }
2583
2584                 bytes -= towrite;
2585                 data += towrite;
2586                 addr += towrite;
2587         }
2588 out:
2589         return r;
2590 }
2591
2592
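/*
 * Emulator read callback: complete a pending MMIO read if one is
 * outstanding, otherwise try an ordinary read through the guest page
 * tables and fall back to MMIO (an in-kernel device, or an exit to user
 * space via vcpu->mmio_needed).
 */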
2593 static int emulator_read_emulated(unsigned long addr,
2594                                   void *val,
2595                                   unsigned int bytes,
2596                                   struct kvm_vcpu *vcpu)
2597 {
2598         gpa_t                 gpa;
2599
2600         if (vcpu->mmio_read_completed) {
2601                 memcpy(val, vcpu->mmio_data, bytes);
2602                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
2603                                vcpu->mmio_phys_addr, *(u64 *)val);
2604                 vcpu->mmio_read_completed = 0;
2605                 return X86EMUL_CONTINUE;
2606         }
2607
2608         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2609
2610         /* For APIC access vmexit */
2611         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2612                 goto mmio;
2613
2614         if (kvm_read_guest_virt(addr, val, bytes, vcpu)
2615                                 == X86EMUL_CONTINUE)
2616                 return X86EMUL_CONTINUE;
2617         if (gpa == UNMAPPED_GVA)
2618                 return X86EMUL_PROPAGATE_FAULT;
2619
2620 mmio:
2621         /*
2622          * Is this MMIO handled locally?
2623          */
2624         if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
2625                 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
2626                 return X86EMUL_CONTINUE;
2627         }
2628
2629         trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
2630
2631         vcpu->mmio_needed = 1;
2632         vcpu->mmio_phys_addr = gpa;
2633         vcpu->mmio_size = bytes;
2634         vcpu->mmio_is_write = 0;
2635
2636         return X86EMUL_UNHANDLEABLE;
2637 }
2638
2639 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
2640                           const void *val, int bytes)
2641 {
2642         int ret;
2643
2644         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
2645         if (ret < 0)
2646                 return 0;
2647         kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
2648         return 1;
2649 }
2650
2651 static int emulator_write_emulated_onepage(unsigned long addr,
2652                                            const void *val,
2653                                            unsigned int bytes,
2654                                            struct kvm_vcpu *vcpu)
2655 {
2656         gpa_t                 gpa;
2657
2658         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2659
2660         if (gpa == UNMAPPED_GVA) {
2661                 kvm_inject_page_fault(vcpu, addr, 2);
2662                 return X86EMUL_PROPAGATE_FAULT;
2663         }
2664
2665         /* For APIC access vmexit */
2666         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2667                 goto mmio;
2668
2669         if (emulator_write_phys(vcpu, gpa, val, bytes))
2670                 return X86EMUL_CONTINUE;
2671
2672 mmio:
2673         trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
2674         /*
2675          * Is this MMIO handled locally?
2676          */
2677         if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
2678                 return X86EMUL_CONTINUE;
2679
2680         vcpu->mmio_needed = 1;
2681         vcpu->mmio_phys_addr = gpa;
2682         vcpu->mmio_size = bytes;
2683         vcpu->mmio_is_write = 1;
2684         memcpy(vcpu->mmio_data, val, bytes);
2685
2686         return X86EMUL_CONTINUE;
2687 }
2688
2689 int emulator_write_emulated(unsigned long addr,
2690                                    const void *val,
2691                                    unsigned int bytes,
2692                                    struct kvm_vcpu *vcpu)
2693 {
2694         /* Crossing a page boundary? */
2695         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
2696                 int rc, now;
2697
2698                 now = -addr & ~PAGE_MASK;
2699                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
2700                 if (rc != X86EMUL_CONTINUE)
2701                         return rc;
2702                 addr += now;
2703                 val += now;
2704                 bytes -= now;
2705         }
2706         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
2707 }
2708 EXPORT_SYMBOL_GPL(emulator_write_emulated);
2709
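/*
 * cmpxchg is emulated as a plain write, so the locked compare is lost.
 * On 32-bit hosts an 8-byte operand that does not cross a page boundary
 * is at least stored with an atomic 64-bit write first.
 */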
2710 static int emulator_cmpxchg_emulated(unsigned long addr,
2711                                      const void *old,
2712                                      const void *new,
2713                                      unsigned int bytes,
2714                                      struct kvm_vcpu *vcpu)
2715 {
2716         printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
2717 #ifndef CONFIG_X86_64
2718         /* a guest cmpxchg8b has to be emulated atomically */
2719         if (bytes == 8) {
2720                 gpa_t gpa;
2721                 struct page *page;
2722                 char *kaddr;
2723                 u64 val;
2724
2725                 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2726
2727                 if (gpa == UNMAPPED_GVA ||
2728                    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2729                         goto emul_write;
2730
2731                 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
2732                         goto emul_write;
2733
2734                 val = *(u64 *)new;
2735
2736                 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2737
2738                 kaddr = kmap_atomic(page, KM_USER0);
2739                 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
2740                 kunmap_atomic(kaddr, KM_USER0);
2741                 kvm_release_page_dirty(page);
2742         }
2743 emul_write:
2744 #endif
2745
2746         return emulator_write_emulated(addr, new, bytes, vcpu);
2747 }
2748
2749 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2750 {
2751         return kvm_x86_ops->get_segment_base(vcpu, seg);
2752 }
2753
2754 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2755 {
2756         kvm_mmu_invlpg(vcpu, address);
2757         return X86EMUL_CONTINUE;
2758 }
2759
2760 int emulate_clts(struct kvm_vcpu *vcpu)
2761 {
2762         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2763         return X86EMUL_CONTINUE;
2764 }
2765
2766 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2767 {
2768         struct kvm_vcpu *vcpu = ctxt->vcpu;
2769
2770         switch (dr) {
2771         case 0 ... 3:
2772                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2773                 return X86EMUL_CONTINUE;
2774         default:
2775                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2776                 return X86EMUL_UNHANDLEABLE;
2777         }
2778 }
2779
2780 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2781 {
2782         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2783         int exception;
2784
2785         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2786         if (exception) {
2787                 /* FIXME: better handling */
2788                 return X86EMUL_UNHANDLEABLE;
2789         }
2790         return X86EMUL_CONTINUE;
2791 }
2792
2793 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2794 {
2795         u8 opcodes[4];
2796         unsigned long rip = kvm_rip_read(vcpu);
2797         unsigned long rip_linear;
2798
2799         if (!printk_ratelimit())
2800                 return;
2801
2802         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2803
2804         kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
2805
2806         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2807                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2808 }
2809 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2810
2811 static struct x86_emulate_ops emulate_ops = {
2812         .read_std            = kvm_read_guest_virt,
2813         .read_emulated       = emulator_read_emulated,
2814         .write_emulated      = emulator_write_emulated,
2815         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
2816 };
2817
2818 static void cache_all_regs(struct kvm_vcpu *vcpu)
2819 {
2820         kvm_register_read(vcpu, VCPU_REGS_RAX);
2821         kvm_register_read(vcpu, VCPU_REGS_RSP);
2822         kvm_register_read(vcpu, VCPU_REGS_RIP);
2823         vcpu->arch.regs_dirty = ~0;
2824 }
2825
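/*
 * Top-level emulation entry point: decode the instruction at the current
 * rip (unless EMULTYPE_NO_DECODE), restrict #UD emulation to
 * VMMCALL/SYSENTER/SYSEXIT/SYSCALL, run x86_emulate_insn() and, when an
 * access cannot be completed in the kernel, set up an MMIO or PIO exit.
 */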
2826 int emulate_instruction(struct kvm_vcpu *vcpu,
2827                         unsigned long cr2,
2828                         u16 error_code,
2829                         int emulation_type)
2830 {
2831         int r, shadow_mask;
2832         struct decode_cache *c;
2833         struct kvm_run *run = vcpu->run;
2834
2835         kvm_clear_exception_queue(vcpu);
2836         vcpu->arch.mmio_fault_cr2 = cr2;
2837         /*
2838          * TODO: fix emulate.c to use guest_read/write_register
2839          * instead of direct ->regs accesses; this can save hundreds of
2840          * cycles on Intel for instructions that don't read/change RSP,
2841          * for example.
2842          */
2843         cache_all_regs(vcpu);
2844
2845         vcpu->mmio_is_write = 0;
2846         vcpu->arch.pio.string = 0;
2847
2848         if (!(emulation_type & EMULTYPE_NO_DECODE)) {
2849                 int cs_db, cs_l;
2850                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2851
2852                 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2853                 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
2854                 vcpu->arch.emulate_ctxt.mode =
2855                         (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2856                         ? X86EMUL_MODE_REAL : cs_l
2857                         ? X86EMUL_MODE_PROT64 : cs_db
2858                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2859
2860                 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2861
2862                 /* Only allow emulation of specific instructions on #UD
2863                  * (namely VMMCALL, sysenter, sysexit, syscall) */
2864                 c = &vcpu->arch.emulate_ctxt.decode;
2865                 if (emulation_type & EMULTYPE_TRAP_UD) {
2866                         if (!c->twobyte)
2867                                 return EMULATE_FAIL;
2868                         switch (c->b) {
2869                         case 0x01: /* VMMCALL */
2870                                 if (c->modrm_mod != 3 || c->modrm_rm != 1)
2871                                         return EMULATE_FAIL;
2872                                 break;
2873                         case 0x34: /* sysenter */
2874                         case 0x35: /* sysexit */
2875                                 if (c->modrm_mod != 0 || c->modrm_rm != 0)
2876                                         return EMULATE_FAIL;
2877                                 break;
2878                         case 0x05: /* syscall */
2879                                 if (c->modrm_mod != 0 || c->modrm_rm != 0)
2880                                         return EMULATE_FAIL;
2881                                 break;
2882                         default:
2883                                 return EMULATE_FAIL;
2884                         }
2885
2886                         if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
2887                                 return EMULATE_FAIL;
2888                 }
2889
2890                 ++vcpu->stat.insn_emulation;
2891                 if (r)  {
2892                         ++vcpu->stat.insn_emulation_fail;
2893                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2894                                 return EMULATE_DONE;
2895                         return EMULATE_FAIL;
2896                 }
2897         }
2898
2899         if (emulation_type & EMULTYPE_SKIP) {
2900                 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
2901                 return EMULATE_DONE;
2902         }
2903
2904         r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2905         shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
2906
2907         if (r == 0)
2908                 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
2909
2910         if (vcpu->arch.pio.string)
2911                 return EMULATE_DO_MMIO;
2912
2913         if ((r || vcpu->mmio_is_write) && run) {
2914                 run->exit_reason = KVM_EXIT_MMIO;
2915                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
2916                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
2917                 run->mmio.len = vcpu->mmio_size;
2918                 run->mmio.is_write = vcpu->mmio_is_write;
2919         }
2920
2921         if (r) {
2922                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2923                         return EMULATE_DONE;
2924                 if (!vcpu->mmio_needed) {
2925                         kvm_report_emulation_failure(vcpu, "mmio");
2926                         return EMULATE_FAIL;
2927                 }
2928                 return EMULATE_DO_MMIO;
2929         }
2930
2931         kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2932
2933         if (vcpu->mmio_is_write) {
2934                 vcpu->mmio_needed = 0;
2935                 return EMULATE_DO_MMIO;
2936         }
2937
2938         return EMULATE_DONE;
2939 }
2940 EXPORT_SYMBOL_GPL(emulate_instruction);
2941
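/*
 * Move string PIO data between the pio_data page and the guest buffer:
 * for an IN the data is written into the guest, for an OUT it is read
 * from it.
 */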
2942 static int pio_copy_data(struct kvm_vcpu *vcpu)
2943 {
2944         void *p = vcpu->arch.pio_data;
2945         gva_t q = vcpu->arch.pio.guest_gva;
2946         unsigned bytes;
2947         int ret;
2948
2949         bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2950         if (vcpu->arch.pio.in)
2951                 ret = kvm_write_guest_virt(q, p, bytes, vcpu);
2952         else
2953                 ret = kvm_read_guest_virt(q, p, bytes, vcpu);
2954         return ret;
2955 }
2956
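/*
 * Finish a port I/O operation once its data is available: latch the
 * result into RAX for a single IN, or advance RSI/RDI (and wind down
 * RCX for REP) for string I/O.
 */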
2957 int complete_pio(struct kvm_vcpu *vcpu)
2958 {
2959         struct kvm_pio_request *io = &vcpu->arch.pio;
2960         long delta;
2961         int r;
2962         unsigned long val;
2963
2964         if (!io->string) {
2965                 if (io->in) {
2966                         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2967                         memcpy(&val, vcpu->arch.pio_data, io->size);
2968                         kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2969                 }
2970         } else {
2971                 if (io->in) {
2972                         r = pio_copy_data(vcpu);
2973                         if (r)
2974                                 return r;
2975                 }
2976
2977                 delta = 1;
2978                 if (io->rep) {
2979                         delta *= io->cur_count;
2980                         /*
2981                          * The size of the register should really depend on
2982                          * current address size.
2983                          */
2984                         val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2985                         val -= delta;
2986                         kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2987                 }
2988                 if (io->down)
2989                         delta = -delta;
2990                 delta *= io->size;
2991                 if (io->in) {
2992                         val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2993                         val += delta;
2994                         kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2995                 } else {
2996                         val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2997                         val += delta;
2998                         kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2999                 }
3000         }
3001
3002         io->count -= io->cur_count;
3003         io->cur_count = 0;
3004
3005         return 0;
3006 }
3007
3008 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3009 {
3010         /* TODO: String I/O for in-kernel devices */
3011         int r;
3012
3013         if (vcpu->arch.pio.in)
3014                 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
3015                                     vcpu->arch.pio.size, pd);
3016         else
3017                 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
3018                                      vcpu->arch.pio.size, pd);
3019         return r;
3020 }
3021
3022 static int pio_string_write(struct kvm_vcpu *vcpu)
3023 {
3024         struct kvm_pio_request *io = &vcpu->arch.pio;
3025         void *pd = vcpu->arch.pio_data;
3026         int i, r = 0;
3027
3028         for (i = 0; i < io->cur_count; i++) {
3029                 if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
3030                                      io->port, io->size, pd)) {
3031                         r = -EOPNOTSUPP;
3032                         break;
3033                 }
3034                 pd += io->size;
3035         }
3036         return r;
3037 }
3038
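/*
 * Emulate a single (non-string) IN/OUT.  The kvm_run exit information is
 * filled in up front so that, if no in-kernel device claims the port, we
 * can return 0 and let userspace handle the access via KVM_EXIT_IO; RAX
 * is staged in pio_data either way.  When the in-kernel pio bus handles
 * the port we complete the operation immediately and return 1.
 */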
3039 int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3040 {
3041         unsigned long val;
3042
3043         vcpu->run->exit_reason = KVM_EXIT_IO;
3044         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3045         vcpu->run->io.size = vcpu->arch.pio.size = size;
3046         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3047         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
3048         vcpu->run->io.port = vcpu->arch.pio.port = port;
3049         vcpu->arch.pio.in = in;
3050         vcpu->arch.pio.string = 0;
3051         vcpu->arch.pio.down = 0;
3052         vcpu->arch.pio.rep = 0;
3053
3054         trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
3055                       size, 1);
3056
3057         val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3058         memcpy(vcpu->arch.pio_data, &val, 4);
3059
3060         if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3061                 complete_pio(vcpu);
3062                 return 1;
3063         }
3064         return 0;
3065 }
3066 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
3067
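/*
 * Emulate string I/O (INS/OUTS).  Only the chunk that fits in the current
 * guest page is emulated per exit: "now" is clamped to the space left in
 * the page, and the instruction is skipped only when the final chunk is
 * being issued.  Decrementing string operations (DF set) are not
 * implemented and inject #GP into the guest instead.
 */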
3068 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3069                   int size, unsigned long count, int down,
3070                   gva_t address, int rep, unsigned port)
3071 {
3072         unsigned now, in_page;
3073         int ret = 0;
3074
3075         vcpu->run->exit_reason = KVM_EXIT_IO;
3076         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3077         vcpu->run->io.size = vcpu->arch.pio.size = size;
3078         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3079         vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
3080         vcpu->run->io.port = vcpu->arch.pio.port = port;
3081         vcpu->arch.pio.in = in;
3082         vcpu->arch.pio.string = 1;
3083         vcpu->arch.pio.down = down;
3084         vcpu->arch.pio.rep = rep;
3085
3086         trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
3087                       size, count);
3088
3089         if (!count) {
3090                 kvm_x86_ops->skip_emulated_instruction(vcpu);
3091                 return 1;
3092         }
3093
3094         if (!down)
3095                 in_page = PAGE_SIZE - offset_in_page(address);
3096         else
3097                 in_page = offset_in_page(address) + size;
3098         now = min(count, (unsigned long)in_page / size);
3099         if (!now)
3100                 now = 1;
3101         if (down) {
3102                 /*
3103                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
3104                  */
3105                 pr_unimpl(vcpu, "guest string pio down\n");
3106                 kvm_inject_gp(vcpu, 0);
3107                 return 1;
3108         }
3109         vcpu->run->io.count = now;
3110         vcpu->arch.pio.cur_count = now;
3111
3112         if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
3113                 kvm_x86_ops->skip_emulated_instruction(vcpu);
3114
3115         vcpu->arch.pio.guest_gva = address;
3116
3117         if (!vcpu->arch.pio.in) {
3118                 /* string PIO write */
3119                 ret = pio_copy_data(vcpu);
3120                 if (ret == X86EMUL_PROPAGATE_FAULT) {
3121                         kvm_inject_gp(vcpu, 0);
3122                         return 1;
3123                 }
3124                 if (ret == 0 && !pio_string_write(vcpu)) {
3125                         complete_pio(vcpu);
3126                         if (vcpu->arch.pio.count == 0)
3127                                 ret = 1;
3128                 }
3129         }
3130         /* no string PIO read support yet */
3131
3132         return ret;
3133 }
3134 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
3135
3136 static void bounce_off(void *info)
3137 {
3138         /* nothing */
3139 }
3140
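/*
 * cpufreq transition hook for kvmclock: frequency increases are acted on
 * at PRECHANGE and decreases at POSTCHANGE.  cpu_tsc_khz is updated for
 * the affected cpu and every vcpu currently resident on it is asked to
 * refresh its guest time before it next enters the guest.
 */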
3141 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
3142                                      void *data)
3143 {
3144         struct cpufreq_freqs *freq = data;
3145         struct kvm *kvm;
3146         struct kvm_vcpu *vcpu;
3147         int i, send_ipi = 0;
3148
3149         if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
3150                 return 0;
3151         if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
3152                 return 0;
3153         per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
3154
3155         spin_lock(&kvm_lock);
3156         list_for_each_entry(kvm, &vm_list, vm_list) {
3157                 kvm_for_each_vcpu(i, vcpu, kvm) {
3158                         if (vcpu->cpu != freq->cpu)
3159                                 continue;
3160                         if (!kvm_request_guest_time_update(vcpu))
3161                                 continue;
3162                         if (vcpu->cpu != smp_processor_id())
3163                                 send_ipi++;
3164                 }
3165         }
3166         spin_unlock(&kvm_lock);
3167
3168         if (freq->old < freq->new && send_ipi) {
3169                 /*
3170                  * We are raising the frequency.  We must make sure the
3171                  * guest doesn't see old kvmclock values while it runs at
3172                  * the new frequency; otherwise we risk the guest seeing
3173                  * time go backwards.
3174                  *
3175                  * In case we update the frequency for another cpu
3176                  * (which might be in guest context) send an interrupt
3177                  * to kick the cpu out of guest context.  Next time
3178                  * guest context is entered kvmclock will be updated,
3179                  * so the guest will not see stale values.
3180                  */
3181                 smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
3182         }
3183         return 0;
3184 }
3185
3186 static struct notifier_block kvmclock_cpufreq_notifier_block = {
3187         .notifier_call  = kvmclock_cpufreq_notifier
3188 };
3189
3190 static void kvm_timer_init(void)
3191 {
3192         int cpu;
3193
3194         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3195                 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3196                                           CPUFREQ_TRANSITION_NOTIFIER);
3197                 for_each_online_cpu(cpu) {
3198                         unsigned long khz = cpufreq_get(cpu);
3199                         if (!khz)
3200                                 khz = tsc_khz;
3201                         per_cpu(cpu_tsc_khz, cpu) = khz;
3202                 }
3203         } else {
3204                 for_each_possible_cpu(cpu)
3205                         per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3206         }
3207 }
3208
3209 int kvm_arch_init(void *opaque)
3210 {
3211         int r;
3212         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
3213
3214         if (kvm_x86_ops) {
3215                 printk(KERN_ERR "kvm: already loaded the other module\n");
3216                 r = -EEXIST;
3217                 goto out;
3218         }
3219
3220         if (!ops->cpu_has_kvm_support()) {
3221                 printk(KERN_ERR "kvm: no hardware support\n");
3222                 r = -EOPNOTSUPP;
3223                 goto out;
3224         }
3225         if (ops->disabled_by_bios()) {
3226                 printk(KERN_ERR "kvm: disabled by bios\n");
3227                 r = -EOPNOTSUPP;
3228                 goto out;
3229         }
3230
3231         r = kvm_mmu_module_init();
3232         if (r)
3233                 goto out;
3234
3235         kvm_init_msr_list();
3236
3237         kvm_x86_ops = ops;
3238         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
3239         kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
3240         kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
3241                         PT_DIRTY_MASK, PT64_NX_MASK, 0);
3242
3243         kvm_timer_init();
3244
3245         return 0;
3246
3247 out:
3248         return r;
3249 }
3250
3251 void kvm_arch_exit(void)
3252 {
3253         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
3254                 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
3255                                             CPUFREQ_TRANSITION_NOTIFIER);
3256         kvm_x86_ops = NULL;
3257         kvm_mmu_module_exit();
3258 }
3259
3260 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
3261 {
3262         ++vcpu->stat.halt_exits;
3263         if (irqchip_in_kernel(vcpu->kvm)) {
3264                 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
3265                 return 1;
3266         } else {
3267                 vcpu->run->exit_reason = KVM_EXIT_HLT;
3268                 return 0;
3269         }
3270 }
3271 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
3272
3273 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
3274                            unsigned long a1)
3275 {
3276         if (is_long_mode(vcpu))
3277                 return a0;
3278         else
3279                 return a0 | ((gpa_t)a1 << 32);
3280 }
3281
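/*
 * KVM hypercall entry point: the hypercall number is taken from RAX and
 * up to four arguments from RBX, RCX, RDX and RSI.  Outside long mode
 * everything is truncated to 32 bits.  Hypercalls from CPL > 0 are
 * rejected with -KVM_EPERM, unknown numbers return -KVM_ENOSYS, and the
 * result is passed back to the guest in RAX.
 */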
3282 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
3283 {
3284         unsigned long nr, a0, a1, a2, a3, ret;
3285         int r = 1;
3286
3287         nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
3288         a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
3289         a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
3290         a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
3291         a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
3292
3293         trace_kvm_hypercall(nr, a0, a1, a2, a3);
3294
3295         if (!is_long_mode(vcpu)) {
3296                 nr &= 0xFFFFFFFF;
3297                 a0 &= 0xFFFFFFFF;
3298                 a1 &= 0xFFFFFFFF;
3299                 a2 &= 0xFFFFFFFF;
3300                 a3 &= 0xFFFFFFFF;
3301         }
3302
3303         if (kvm_x86_ops->get_cpl(vcpu) != 0) {
3304                 ret = -KVM_EPERM;
3305                 goto out;
3306         }
3307
3308         switch (nr) {
3309         case KVM_HC_VAPIC_POLL_IRQ:
3310                 ret = 0;
3311                 break;
3312         case KVM_HC_MMU_OP:
3313                 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
3314                 break;
3315         default:
3316                 ret = -KVM_ENOSYS;
3317                 break;
3318         }
3319 out:
3320         kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
3321         ++vcpu->stat.hypercalls;
3322         return r;
3323 }
3324 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
3325
3326 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3327 {
3328         char instruction[3];
3329         int ret = 0;
3330         unsigned long rip = kvm_rip_read(vcpu);
3331
3332
3333         /*
3334          * Blow out the MMU so that no other VCPU has an active mapping;
3335          * this ensures the updated hypercall appears atomically across
3336          * all VCPUs.
3337          */
3338         kvm_mmu_zap_all(vcpu->kvm);
3339
3340         kvm_x86_ops->patch_hypercall(vcpu, instruction);
3341         if (emulator_write_emulated(rip, instruction, 3, vcpu)
3342             != X86EMUL_CONTINUE)
3343                 ret = -EFAULT;
3344
3345         return ret;
3346 }
3347
3348 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3349 {
3350         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3351 }
3352
3353 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
3354 {
3355         struct descriptor_table dt = { limit, base };
3356
3357         kvm_x86_ops->set_gdt(vcpu, &dt);
3358 }
3359
3360 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
3361 {
3362         struct descriptor_table dt = { limit, base };
3363
3364         kvm_x86_ops->set_idt(vcpu, &dt);
3365 }
3366
3367 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
3368                    unsigned long *rflags)
3369 {
3370         kvm_lmsw(vcpu, msw);
3371         *rflags = kvm_get_rflags(vcpu);
3372 }
3373
3374 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3375 {
3376         unsigned long value;
3377
3378         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3379         switch (cr) {
3380         case 0:
3381                 value = vcpu->arch.cr0;
3382                 break;
3383         case 2:
3384                 value = vcpu->arch.cr2;
3385                 break;
3386         case 3:
3387                 value = vcpu->arch.cr3;
3388                 break;
3389         case 4:
3390                 value = vcpu->arch.cr4;
3391                 break;
3392         case 8:
3393                 value = kvm_get_cr8(vcpu);
3394                 break;
3395         default:
3396                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3397                 return 0;
3398         }
3399
3400         return value;
3401 }
3402
3403 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3404                      unsigned long *rflags)
3405 {
3406         switch (cr) {
3407         case 0:
3408                 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
3409                 *rflags = kvm_get_rflags(vcpu);
3410                 break;
3411         case 2:
3412                 vcpu->arch.cr2 = val;
3413                 break;
3414         case 3:
3415                 kvm_set_cr3(vcpu, val);
3416                 break;
3417         case 4:
3418                 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
3419                 break;
3420         case 8:
3421                 kvm_set_cr8(vcpu, val & 0xfUL);
3422                 break;
3423         default:
3424                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3425         }
3426 }
3427
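/*
 * Some CPUID leaves (e.g. leaf 2 on Intel) are "stateful": successive
 * reads of the same function return successive entries.  The entry to
 * return next is tracked with KVM_CPUID_FLAG_STATE_READ_NEXT; this helper
 * clears the flag on the entry just consumed and sets it on the next
 * entry with the same function number, wrapping around if necessary.
 */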
3428 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
3429 {
3430         struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
3431         int j, nent = vcpu->arch.cpuid_nent;
3432
3433         e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
3434         /* when no next entry is found, the current entry[i] is reselected */
3435         for (j = (i + 1) % nent; ; j = (j + 1) % nent) {
3436                 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
3437                 if (ej->function == e->function) {
3438                         ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
3439                         return j;
3440                 }
3441         }
3442         return 0; /* silence gcc, even though control never reaches here */
3443 }
3444
3445 /* find an entry with matching function, matching index (if needed), and that
3446  * should be read next (if it's stateful) */
3447 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
3448         u32 function, u32 index)
3449 {
3450         if (e->function != function)
3451                 return 0;
3452         if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
3453                 return 0;
3454         if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
3455             !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
3456                 return 0;
3457         return 1;
3458 }
3459
3460 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3461                                               u32 function, u32 index)
3462 {
3463         int i;
3464         struct kvm_cpuid_entry2 *best = NULL;
3465
3466         for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
3467                 struct kvm_cpuid_entry2 *e;
3468
3469                 e = &vcpu->arch.cpuid_entries[i];
3470                 if (is_matching_cpuid_entry(e, function, index)) {
3471                         if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
3472                                 move_to_next_stateful_cpuid_entry(vcpu, i);
3473                         best = e;
3474                         break;
3475                 }
3476                 /*
3477                  * Same CPUID range as the request: both basic or both extended?
3478                  */
3479                 if (((e->function ^ function) & 0x80000000) == 0)
3480                         if (!best || e->function > best->function)
3481                                 best = e;
3482         }
3483         return best;
3484 }
3485
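/*
 * Physical address width as the guest sees it: CPUID.80000008H:EAX[7:0],
 * falling back to 36 bits when the leaf is not present.  E.g. EAX[7:0] =
 * 0x28 means 40-bit physical addresses.
 */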
3486 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3487 {
3488         struct kvm_cpuid_entry2 *best;
3489
3490         best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
3491         if (best)
3492                 return best->eax & 0xff;
3493         return 36;
3494 }
3495
3496 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3497 {
3498         u32 function, index;
3499         struct kvm_cpuid_entry2 *best;
3500
3501         function = kvm_register_read(vcpu, VCPU_REGS_RAX);
3502         index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3503         kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
3504         kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
3505         kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
3506         kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
3507         best = kvm_find_cpuid_entry(vcpu, function, index);
3508         if (best) {
3509                 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
3510                 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
3511                 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
3512                 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
3513         }
3514         kvm_x86_ops->skip_emulated_instruction(vcpu);
3515         trace_kvm_cpuid(function,
3516                         kvm_register_read(vcpu, VCPU_REGS_RAX),
3517                         kvm_register_read(vcpu, VCPU_REGS_RBX),
3518                         kvm_register_read(vcpu, VCPU_REGS_RCX),
3519                         kvm_register_read(vcpu, VCPU_REGS_RDX));
3520 }
3521 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3522
3523 /*
3524  * Check if userspace requested an interrupt window, and that the
3525  * interrupt window is open.
3526  *
3527  * No need to exit to userspace if we already have an interrupt queued.
3528  */
3529 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
3530 {
3531         return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3532                 vcpu->run->request_interrupt_window &&
3533                 kvm_arch_interrupt_allowed(vcpu));
3534 }
3535
3536 static void post_kvm_run_save(struct kvm_vcpu *vcpu)
3537 {
3538         struct kvm_run *kvm_run = vcpu->run;
3539
3540         kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3541         kvm_run->cr8 = kvm_get_cr8(vcpu);
3542         kvm_run->apic_base = kvm_get_apic_base(vcpu);
3543         if (irqchip_in_kernel(vcpu->kvm))
3544                 kvm_run->ready_for_interrupt_injection = 1;
3545         else
3546                 kvm_run->ready_for_interrupt_injection =
3547                         kvm_arch_interrupt_allowed(vcpu) &&
3548                         !kvm_cpu_has_interrupt(vcpu) &&
3549                         !kvm_event_needs_reinjection(vcpu);
3550 }
3551
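/*
 * vapic_enter()/vapic_exit() pin and release the guest page that backs
 * the virtual APIC state used for TPR synchronization while the vcpu
 * loop is running; the page is marked dirty on exit because the
 * guest-visible TPR may have changed.
 */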
3552 static void vapic_enter(struct kvm_vcpu *vcpu)
3553 {
3554         struct kvm_lapic *apic = vcpu->arch.apic;
3555         struct page *page;
3556
3557         if (!apic || !apic->vapic_addr)
3558                 return;
3559
3560         page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3561
3562         vcpu->arch.apic->vapic_page = page;
3563 }
3564
3565 static void vapic_exit(struct kvm_vcpu *vcpu)
3566 {
3567         struct kvm_lapic *apic = vcpu->arch.apic;
3568
3569         if (!apic || !apic->vapic_addr)
3570                 return;
3571
3572         down_read(&vcpu->kvm->slots_lock);
3573         kvm_release_page_dirty(apic->vapic_page);
3574         mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3575         up_read(&vcpu->kvm->slots_lock);
3576 }
3577
3578 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3579 {
3580         int max_irr, tpr;
3581
3582         if (!kvm_x86_ops->update_cr8_intercept)
3583                 return;
3584
3585         if (!vcpu->arch.apic)
3586                 return;
3587
3588         if (!vcpu->arch.apic->vapic_addr)
3589                 max_irr = kvm_lapic_find_highest_irr(vcpu);
3590         else
3591                 max_irr = -1;
3592
3593         if (max_irr != -1)
3594                 max_irr >>= 4;
3595
3596         tpr = kvm_lapic_get_cr8(vcpu);
3597
3598         kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3599 }
3600
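/*
 * (Re)inject at most one event per guest entry, in priority order: a
 * pending exception first, then an NMI or interrupt whose earlier
 * injection did not complete, and only then a brand new NMI or external
 * interrupt, subject to nmi_allowed()/interrupt_allowed().
 */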
3601 static void inject_pending_event(struct kvm_vcpu *vcpu)
3602 {
3603         /* try to reinject previous events if any */
3604         if (vcpu->arch.exception.pending) {
3605                 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
3606                                           vcpu->arch.exception.has_error_code,
3607                                           vcpu->arch.exception.error_code);
3608                 return;
3609         }
3610
3611         if (vcpu->arch.nmi_injected) {
3612                 kvm_x86_ops->set_nmi(vcpu);
3613                 return;
3614         }
3615
3616         if (vcpu->arch.interrupt.pending) {
3617                 kvm_x86_ops->set_irq(vcpu);
3618                 return;
3619         }
3620
3621         /* try to inject new event if pending */
3622         if (vcpu->arch.nmi_pending) {
3623                 if (kvm_x86_ops->nmi_allowed(vcpu)) {
3624                         vcpu->arch.nmi_pending = false;
3625                         vcpu->arch.nmi_injected = true;
3626                         kvm_x86_ops->set_nmi(vcpu);
3627                 }
3628         } else if (kvm_cpu_has_interrupt(vcpu)) {
3629                 if (kvm_x86_ops->interrupt_allowed(vcpu)) {
3630                         kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
3631                                             false);
3632                         kvm_x86_ops->set_irq(vcpu);
3633                 }
3634         }
3635 }
3636
3637 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3638 {
3639         int r;
3640         bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3641                 vcpu->run->request_interrupt_window;
3642
3643         if (vcpu->requests)
3644                 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
3645                         kvm_mmu_unload(vcpu);
3646
3647         r = kvm_mmu_reload(vcpu);
3648         if (unlikely(r))
3649                 goto out;
3650
3651         if (vcpu->requests) {
3652                 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
3653                         __kvm_migrate_timers(vcpu);
3654                 if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
3655                         kvm_write_guest_time(vcpu);
3656                 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
3657                         kvm_mmu_sync_roots(vcpu);
3658                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
3659                         kvm_x86_ops->tlb_flush(vcpu);
3660                 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3661                                        &vcpu->requests)) {
3662                         vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
3663                         r = 0;
3664                         goto out;
3665                 }
3666                 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3667                         vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
3668                         r = 0;
3669                         goto out;
3670                 }
3671         }
3672
3673         preempt_disable();
3674
3675         kvm_x86_ops->prepare_guest_switch(vcpu);
3676         kvm_load_guest_fpu(vcpu);
3677
3678         local_irq_disable();
3679
3680         clear_bit(KVM_REQ_KICK, &vcpu->requests);
3681         smp_mb__after_clear_bit();
3682
3683         if (vcpu->requests || need_resched() || signal_pending(current)) {
3684                 set_bit(KVM_REQ_KICK, &vcpu->requests);
3685                 local_irq_enable();
3686                 preempt_enable();
3687                 r = 1;
3688                 goto out;
3689         }
3690
3691         inject_pending_event(vcpu);
3692
3693         /* enable NMI/IRQ window open exits if needed */
3694         if (vcpu->arch.nmi_pending)
3695                 kvm_x86_ops->enable_nmi_window(vcpu);
3696         else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
3697                 kvm_x86_ops->enable_irq_window(vcpu);
3698
3699         if (kvm_lapic_enabled(vcpu)) {
3700                 update_cr8_intercept(vcpu);
3701                 kvm_lapic_sync_to_vapic(vcpu);
3702         }
3703
3704         up_read(&vcpu->kvm->slots_lock);
3705
3706         kvm_guest_enter();
3707
3708         if (unlikely(vcpu->arch.switch_db_regs)) {
3709                 set_debugreg(0, 7);
3710                 set_debugreg(vcpu->arch.eff_db[0], 0);
3711                 set_debugreg(vcpu->arch.eff_db[1], 1);
3712                 set_debugreg(vcpu->arch.eff_db[2], 2);
3713                 set_debugreg(vcpu->arch.eff_db[3], 3);
3714         }
3715
3716         trace_kvm_entry(vcpu->vcpu_id);
3717         kvm_x86_ops->run(vcpu);
3718
3719         if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
3720                 set_debugreg(current->thread.debugreg0, 0);
3721                 set_debugreg(current->thread.debugreg1, 1);
3722                 set_debugreg(current->thread.debugreg2, 2);
3723                 set_debugreg(current->thread.debugreg3, 3);
3724                 set_debugreg(current->thread.debugreg6, 6);
3725                 set_debugreg(current->thread.debugreg7, 7);
3726         }
3727
3728         set_bit(KVM_REQ_KICK, &vcpu->requests);
3729         local_irq_enable();
3730
3731         ++vcpu->stat.exits;
3732
3733         /*
3734          * We must have an instruction between local_irq_enable() and
3735          * kvm_guest_exit(), so the timer interrupt isn't delayed by
3736          * the interrupt shadow.  The stat.exits increment will do nicely.
3737          * But we need to prevent reordering, hence this barrier():
3738          */
3739         barrier();
3740
3741         kvm_guest_exit();
3742
3743         preempt_enable();
3744
3745         down_read(&vcpu->kvm->slots_lock);
3746
3747         /*
3748          * Profile KVM exit RIPs:
3749          */
3750         if (unlikely(prof_on == KVM_PROFILING)) {
3751                 unsigned long rip = kvm_rip_read(vcpu);
3752                 profile_hit(KVM_PROFILING, (void *)rip);
3753         }
3754
3755
3756         kvm_lapic_sync_from_vapic(vcpu);
3757
3758         r = kvm_x86_ops->handle_exit(vcpu);
3759 out:
3760         return r;
3761 }
3762
3763
3764 static int __vcpu_run(struct kvm_vcpu *vcpu)
3765 {
3766         int r;
3767
3768         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3769                 pr_debug("vcpu %d received sipi with vector # %x\n",
3770                          vcpu->vcpu_id, vcpu->arch.sipi_vector);
3771                 kvm_lapic_reset(vcpu);
3772                 r = kvm_arch_vcpu_reset(vcpu);
3773                 if (r)
3774                         return r;
3775                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3776         }
3777
3778         down_read(&vcpu->kvm->slots_lock);
3779         vapic_enter(vcpu);
3780
3781         r = 1;
3782         while (r > 0) {
3783                 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3784                         r = vcpu_enter_guest(vcpu);
3785                 else {
3786                         up_read(&vcpu->kvm->slots_lock);
3787                         kvm_vcpu_block(vcpu);
3788                         down_read(&vcpu->kvm->slots_lock);
3789                         if (test_and_clear_bit(KVM_REQ_UNHALT,
3790                                                &vcpu->requests)) {
3791                                 switch (vcpu->arch.mp_state) {
3792                                 case KVM_MP_STATE_HALTED:
3793                                         vcpu->arch.mp_state =
3794                                                 KVM_MP_STATE_RUNNABLE;
3795                                 case KVM_MP_STATE_RUNNABLE:
3796                                         break;
3797                                 case KVM_MP_STATE_SIPI_RECEIVED:
3798                                 default:
3799                                         r = -EINTR;
3800                                         break;
3801                                 }
3802                         }
3803                 }
3804
3805                 if (r <= 0)
3806                         break;
3807
3808                 clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3809                 if (kvm_cpu_has_pending_timer(vcpu))
3810                         kvm_inject_pending_timer_irqs(vcpu);
3811
3812                 if (dm_request_for_irq_injection(vcpu)) {
3813                         r = -EINTR;
3814                         vcpu->run->exit_reason = KVM_EXIT_INTR;
3815                         ++vcpu->stat.request_irq_exits;
3816                 }
3817                 if (signal_pending(current)) {
3818                         r = -EINTR;
3819                         vcpu->run->exit_reason = KVM_EXIT_INTR;
3820                         ++vcpu->stat.signal_exits;
3821                 }
3822                 if (need_resched()) {
3823                         up_read(&vcpu->kvm->slots_lock);
3824                         kvm_resched(vcpu);
3825                         down_read(&vcpu->kvm->slots_lock);
3826                 }
3827         }
3828
3829         up_read(&vcpu->kvm->slots_lock);
3830         post_kvm_run_save(vcpu);
3831
3832         vapic_exit(vcpu);
3833
3834         return r;
3835 }
3836
3837 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3838 {
3839         int r;
3840         sigset_t sigsaved;
3841
3842         vcpu_load(vcpu);
3843
3844         if (vcpu->sigset_active)
3845                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3846
3847         if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
3848                 kvm_vcpu_block(vcpu);
3849                 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
3850                 r = -EAGAIN;
3851                 goto out;
3852         }
3853
3854         /* re-sync apic's tpr */
3855         if (!irqchip_in_kernel(vcpu->kvm))
3856                 kvm_set_cr8(vcpu, kvm_run->cr8);
3857
3858         if (vcpu->arch.pio.cur_count) {
3859                 r = complete_pio(vcpu);
3860                 if (r)
3861                         goto out;
3862         }
3863         if (vcpu->mmio_needed) {
3864                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3865                 vcpu->mmio_read_completed = 1;
3866                 vcpu->mmio_needed = 0;
3867
3868                 down_read(&vcpu->kvm->slots_lock);
3869                 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
3870                                         EMULTYPE_NO_DECODE);
3871                 up_read(&vcpu->kvm->slots_lock);
3872                 if (r == EMULATE_DO_MMIO) {
3873                         /*
3874                          * Read-modify-write.  Back to userspace.
3875                          */
3876                         r = 0;
3877                         goto out;
3878                 }
3879         }
3880         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3881                 kvm_register_write(vcpu, VCPU_REGS_RAX,
3882                                      kvm_run->hypercall.ret);
3883
3884         r = __vcpu_run(vcpu);
3885
3886 out:
3887         if (vcpu->sigset_active)
3888                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
3889
3890         vcpu_put(vcpu);
3891         return r;
3892 }
3893
3894 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3895 {
3896         vcpu_load(vcpu);
3897
3898         regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3899         regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3900         regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3901         regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3902         regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3903         regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3904         regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3905         regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3906 #ifdef CONFIG_X86_64
3907         regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3908         regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3909         regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3910         regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3911         regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3912         regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3913         regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3914         regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
3915 #endif
3916
3917         regs->rip = kvm_rip_read(vcpu);
3918         regs->rflags = kvm_get_rflags(vcpu);
3919
3920         vcpu_put(vcpu);
3921
3922         return 0;
3923 }
3924
3925 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3926 {
3927         vcpu_load(vcpu);
3928
3929         kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3930         kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3931         kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3932         kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3933         kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3934         kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3935         kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3936         kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
3937 #ifdef CONFIG_X86_64
3938         kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3939         kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3940         kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3941         kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3942         kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3943         kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3944         kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3945         kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3946 #endif
3947
3948         kvm_rip_write(vcpu, regs->rip);
3949         kvm_set_rflags(vcpu, regs->rflags);
3950
3951         vcpu->arch.exception.pending = false;
3952
3953         vcpu_put(vcpu);
3954
3955         return 0;
3956 }
3957
3958 void kvm_get_segment(struct kvm_vcpu *vcpu,
3959                      struct kvm_segment *var, int seg)
3960 {
3961         kvm_x86_ops->get_segment(vcpu, var, seg);
3962 }
3963
3964 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3965 {
3966         struct kvm_segment cs;
3967
3968         kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3969         *db = cs.db;
3970         *l = cs.l;
3971 }
3972 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
3973
3974 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3975                                   struct kvm_sregs *sregs)
3976 {
3977         struct descriptor_table dt;
3978
3979         vcpu_load(vcpu);
3980
3981         kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3982         kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3983         kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3984         kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3985         kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3986         kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3987
3988         kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3989         kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3990
3991         kvm_x86_ops->get_idt(vcpu, &dt);
3992         sregs->idt.limit = dt.limit;
3993         sregs->idt.base = dt.base;
3994         kvm_x86_ops->get_gdt(vcpu, &dt);
3995         sregs->gdt.limit = dt.limit;
3996         sregs->gdt.base = dt.base;
3997
3998         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3999         sregs->cr0 = vcpu->arch.cr0;
4000         sregs->cr2 = vcpu->arch.cr2;
4001         sregs->cr3 = vcpu->arch.cr3;
4002         sregs->cr4 = vcpu->arch.cr4;
4003         sregs->cr8 = kvm_get_cr8(vcpu);
4004         sregs->efer = vcpu->arch.shadow_efer;
4005         sregs->apic_base = kvm_get_apic_base(vcpu);
4006
4007         memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
4008
4009         if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
4010                 set_bit(vcpu->arch.interrupt.nr,
4011                         (unsigned long *)sregs->interrupt_bitmap);
4012
4013         vcpu_put(vcpu);
4014
4015         return 0;
4016 }
4017
4018 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
4019                                     struct kvm_mp_state *mp_state)
4020 {
4021         vcpu_load(vcpu);
4022         mp_state->mp_state = vcpu->arch.mp_state;
4023         vcpu_put(vcpu);
4024         return 0;
4025 }
4026
4027 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
4028                                     struct kvm_mp_state *mp_state)
4029 {
4030         vcpu_load(vcpu);
4031         vcpu->arch.mp_state = mp_state->mp_state;
4032         vcpu_put(vcpu);
4033         return 0;
4034 }
4035
4036 static void kvm_set_segment(struct kvm_vcpu *vcpu,
4037                         struct kvm_segment *var, int seg)
4038 {
4039         kvm_x86_ops->set_segment(vcpu, var, seg);
4040 }
4041
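/*
 * Convert a raw 8-byte segment descriptor into the kvm_segment layout.
 * With the granularity bit set the 20-bit limit counts 4K pages, so it
 * is expanded to bytes as (limit << 12) | 0xfff; e.g. a flat descriptor
 * with limit 0xfffff and G=1 yields 0xffffffff.  A null selector marks
 * the segment unusable.
 */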
4042 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
4043                                    struct kvm_segment *kvm_desct)
4044 {
4045         kvm_desct->base = get_desc_base(seg_desc);
4046         kvm_desct->limit = get_desc_limit(seg_desc);
4047         if (seg_desc->g) {
4048                 kvm_desct->limit <<= 12;
4049                 kvm_desct->limit |= 0xfff;
4050         }
4051         kvm_desct->selector = selector;
4052         kvm_desct->type = seg_desc->type;
4053         kvm_desct->present = seg_desc->p;
4054         kvm_desct->dpl = seg_desc->dpl;
4055         kvm_desct->db = seg_desc->d;
4056         kvm_desct->s = seg_desc->s;
4057         kvm_desct->l = seg_desc->l;
4058         kvm_desct->g = seg_desc->g;
4059         kvm_desct->avl = seg_desc->avl;
4060         if (!selector)
4061                 kvm_desct->unusable = 1;
4062         else
4063                 kvm_desct->unusable = 0;
4064         kvm_desct->padding = 0;
4065 }
4066
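/*
 * Bit 2 of a selector is the table indicator: 1 selects the current LDT,
 * 0 the GDT.  E.g. selector 0x002f has TI set and indexes entry 5 of the
 * LDT, while 0x0010 indexes entry 2 of the GDT.
 */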
4067 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
4068                                           u16 selector,
4069                                           struct descriptor_table *dtable)
4070 {
4071         if (selector & 1 << 2) {
4072                 struct kvm_segment kvm_seg;
4073
4074                 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
4075
4076                 if (kvm_seg.unusable)
4077                         dtable->limit = 0;
4078                 else
4079                         dtable->limit = kvm_seg.limit;
4080                 dtable->base = kvm_seg.base;
4081         }
4082         else
4083                 kvm_x86_ops->get_gdt(vcpu, dtable);
4084 }
4085
4086 /* allowed just for 8-byte segment descriptors */
4087 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4088                                          struct desc_struct *seg_desc)
4089 {
4090         struct descriptor_table dtable;
4091         u16 index = selector >> 3;
4092
4093         get_segment_descriptor_dtable(vcpu, selector, &dtable);
4094
4095         if (dtable.limit < index * 8 + 7) {
4096                 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4097                 return 1;
4098         }
4099         return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
4100 }
4101
4102 /* allowed just for 8-byte segment descriptors */
4103 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4104                                          struct desc_struct *seg_desc)
4105 {
4106         struct descriptor_table dtable;
4107         u16 index = selector >> 3;
4108
4109         get_segment_descriptor_dtable(vcpu, selector, &dtable);
4110
4111         if (dtable.limit < index * 8 + 7)
4112                 return 1;
4113         return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
4114 }
4115
4116 static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu,
4117                              struct desc_struct *seg_desc)
4118 {
4119         u32 base_addr = get_desc_base(seg_desc);
4120
4121         return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
4122 }
4123
4124 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4125 {
4126         struct kvm_segment kvm_seg;
4127
4128         kvm_get_segment(vcpu, &kvm_seg, seg);
4129         return kvm_seg.selector;
4130 }
4131
4132 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
4133                                                 u16 selector,
4134                                                 struct kvm_segment *kvm_seg)
4135 {
4136         struct desc_struct seg_desc;
4137
4138         if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
4139                 return 1;
4140         seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
4141         return 0;
4142 }
4143
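/*
 * In real mode (and VM86) a segment register simply holds a paragraph
 * number: base = selector << 4 with a fixed 64K limit, e.g. selector
 * 0xf000 gives base 0xf0000.  The descriptor fields are synthesized as a
 * present, DPL 3, read/write data segment.
 */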
4144 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4145 {
4146         struct kvm_segment segvar = {
4147                 .base = selector << 4,
4148                 .limit = 0xffff,
4149                 .selector = selector,
4150                 .type = 3,
4151                 .present = 1,
4152                 .dpl = 3,
4153                 .db = 0,
4154                 .s = 1,
4155                 .l = 0,
4156                 .g = 0,
4157                 .avl = 0,
4158                 .unusable = 0,
4159         };
4160         kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4161         return 0;
4162 }
4163
4164 static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4165 {
4166         return (seg != VCPU_SREG_LDTR) &&
4167                 (seg != VCPU_SREG_TR) &&
4168                 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4169 }
4170
4171 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4172                                 int type_bits, int seg)
4173 {
4174         struct kvm_segment kvm_seg;
4175
4176         if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
4177                 return kvm_load_realmode_segment(vcpu, selector, seg);
4178         if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
4179                 return 1;
4180         kvm_seg.type |= type_bits;
4181
4182         if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
4183             seg != VCPU_SREG_LDTR)
4184                 if (!kvm_seg.s)
4185                         kvm_seg.unusable = 1;
4186
4187         kvm_set_segment(vcpu, &kvm_seg, seg);
4188         return 0;
4189 }
4190
4191 static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4192                                 struct tss_segment_32 *tss)
4193 {
4194         tss->cr3 = vcpu->arch.cr3;
4195         tss->eip = kvm_rip_read(vcpu);
4196         tss->eflags = kvm_get_rflags(vcpu);
4197         tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4198         tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4199         tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4200         tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4201         tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4202         tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4203         tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
4204         tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
4205         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4206         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4207         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4208         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4209         tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
4210         tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
4211         tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4212 }
4213
4214 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4215                                   struct tss_segment_32 *tss)
4216 {
4217         kvm_set_cr3(vcpu, tss->cr3);
4218
4219         kvm_rip_write(vcpu, tss->eip);
4220         kvm_set_rflags(vcpu, tss->eflags | 2);
4221
4222         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4223         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
4224         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
4225         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
4226         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
4227         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
4228         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
4229         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
4230
4231         if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
4232                 return 1;
4233
4234         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
4235                 return 1;
4236
4237         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
4238                 return 1;
4239
4240         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
4241                 return 1;
4242
4243         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
4244                 return 1;
4245
4246         if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
4247                 return 1;
4248
4249         if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
4250                 return 1;
4251         return 0;
4252 }
4253
4254 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4255                                 struct tss_segment_16 *tss)
4256 {
4257         tss->ip = kvm_rip_read(vcpu);
4258         tss->flag = kvm_get_rflags(vcpu);
4259         tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4260         tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4261         tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4262         tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4263         tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4264         tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4265         tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
4266         tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
4267
4268         tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4269         tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4270         tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4271         tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4272         tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4273 }
4274
4275 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4276                                  struct tss_segment_16 *tss)
4277 {
4278         kvm_rip_write(vcpu, tss->ip);
4279         kvm_set_rflags(vcpu, tss->flag | 2);
4280         kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
4281         kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
4282         kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
4283         kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
4284         kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
4285         kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
4286         kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
4287         kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
4288
4289         if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
4290                 return 1;
4291
4292         if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
4293                 return 1;
4294
4295         if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
4296                 return 1;
4297
4298         if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
4299                 return 1;
4300
4301         if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
4302                 return 1;
4303         return 0;
4304 }
4305
4306 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4307                               u16 old_tss_sel, u32 old_tss_base,
4308                               struct desc_struct *nseg_desc)
4309 {
4310         struct tss_segment_16 tss_segment_16;
4311         int ret = 0;
4312
4313         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
4314                            sizeof tss_segment_16))
4315                 goto out;
4316
4317         save_state_to_tss16(vcpu, &tss_segment_16);
4318
4319         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
4320                             sizeof tss_segment_16))
4321                 goto out;
4322
4323         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
4324                            &tss_segment_16, sizeof tss_segment_16))
4325                 goto out;
4326
4327         if (old_tss_sel != 0xffff) {
4328                 tss_segment_16.prev_task_link = old_tss_sel;
4329
4330                 if (kvm_write_guest(vcpu->kvm,
4331                                     get_tss_base_addr(vcpu, nseg_desc),
4332                                     &tss_segment_16.prev_task_link,
4333                                     sizeof tss_segment_16.prev_task_link))
4334                         goto out;
4335         }
4336
4337         if (load_state_from_tss16(vcpu, &tss_segment_16))
4338                 goto out;
4339
4340         ret = 1;
4341 out:
4342         return ret;
4343 }
4344
4345 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4346                        u16 old_tss_sel, u32 old_tss_base,
4347                        struct desc_struct *nseg_desc)
4348 {
4349         struct tss_segment_32 tss_segment_32;
4350         int ret = 0;
4351
4352         if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
4353                            sizeof tss_segment_32))
4354                 goto out;
4355
4356         save_state_to_tss32(vcpu, &tss_segment_32);
4357
4358         if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
4359                             sizeof tss_segment_32))
4360                 goto out;
4361
4362         if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
4363                            &tss_segment_32, sizeof tss_segment_32))
4364                 goto out;
4365
4366         if (old_tss_sel != 0xffff) {
4367                 tss_segment_32.prev_task_link = old_tss_sel;
4368
4369                 if (kvm_write_guest(vcpu->kvm,
4370                                     get_tss_base_addr(vcpu, nseg_desc),
4371                                     &tss_segment_32.prev_task_link,
4372                                     sizeof tss_segment_32.prev_task_link))
4373                         goto out;
4374         }
4375
4376         if (load_state_from_tss32(vcpu, &tss_segment_32))
4377                 goto out;
4378
4379         ret = 1;
4380 out:
4381         return ret;
4382 }
4383
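/*
 * Emulate a hardware task switch: validate the target TSS descriptor,
 * save the current register state into the outgoing TSS, load the new
 * TSS (16- or 32-bit depending on the descriptor type), maintain the
 * busy bit and the back link / NT flag according to whether the switch
 * came from CALL, interrupt gate, JMP or IRET, and finally load TR and
 * set CR0.TS.
 */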
4384 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4385 {
4386         struct kvm_segment tr_seg;
4387         struct desc_struct cseg_desc;
4388         struct desc_struct nseg_desc;
4389         int ret = 0;
4390         u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
4391         u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
4392
4393         old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
4394
4395         /* FIXME: Handle errors. Failure to read either TSS or their
4396          * descriptors should generate a pagefault.
4397          */
4398         if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
4399                 goto out;
4400
4401         if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
4402                 goto out;
4403
4404         if (reason != TASK_SWITCH_IRET) {
4405                 int cpl;
4406
4407                 cpl = kvm_x86_ops->get_cpl(vcpu);
4408                 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
4409                         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4410                         return 1;
4411                 }
4412         }
4413
4414         if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
4415                 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
4416                 return 1;
4417         }
4418
4419         if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
4420                 cseg_desc.type &= ~(1 << 1); /* clear the busy (B) flag */
4421                 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
4422         }
4423
4424         if (reason == TASK_SWITCH_IRET) {
4425                 u32 eflags = kvm_get_rflags(vcpu);
4426                 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4427         }
4428
4429         /* set back link to prev task only if NT bit is set in eflags;
4430            note that old_tss_sel is not used after this point */
4431         if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4432                 old_tss_sel = 0xffff;
4433
4439         if (nseg_desc.type & 8)
4440                 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4441                                          old_tss_base, &nseg_desc);
4442         else
4443                 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
4444                                          old_tss_base, &nseg_desc);
4445
4446         if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4447                 u32 eflags = kvm_get_rflags(vcpu);
4448                 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4449         }
4450
4451         if (reason != TASK_SWITCH_IRET) {
4452                 nseg_desc.type |= (1 << 1);
4453                 save_guest_segment_descriptor(vcpu, tss_selector,
4454                                               &nseg_desc);
4455         }
4456
4457         kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
4458         seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
4459         tr_seg.type = 11;
4460         kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
4461 out:
4462         return ret;
4463 }
4464 EXPORT_SYMBOL_GPL(kvm_task_switch);
4465
4466 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4467                                   struct kvm_sregs *sregs)
4468 {
4469         int mmu_reset_needed = 0;
4470         int pending_vec, max_bits;
4471         struct descriptor_table dt;
4472
4473         vcpu_load(vcpu);
4474
4475         dt.limit = sregs->idt.limit;
4476         dt.base = sregs->idt.base;
4477         kvm_x86_ops->set_idt(vcpu, &dt);
4478         dt.limit = sregs->gdt.limit;
4479         dt.base = sregs->gdt.base;
4480         kvm_x86_ops->set_gdt(vcpu, &dt);
4481
4482         vcpu->arch.cr2 = sregs->cr2;
4483         mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
4484         vcpu->arch.cr3 = sregs->cr3;
4485
4486         kvm_set_cr8(vcpu, sregs->cr8);
4487
4488         mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
4489         kvm_x86_ops->set_efer(vcpu, sregs->efer);
4490         kvm_set_apic_base(vcpu, sregs->apic_base);
4491
4492         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
4493
4494         mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4495         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4496         vcpu->arch.cr0 = sregs->cr0;
4497
4498         mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4499         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4500         if (!is_long_mode(vcpu) && is_pae(vcpu))
4501                 load_pdptrs(vcpu, vcpu->arch.cr3);
4502
4503         if (mmu_reset_needed)
4504                 kvm_mmu_reset_context(vcpu);
4505
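             /* Re-queue a pending external interrupt saved by userspace, if any. */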
4506         max_bits = (sizeof sregs->interrupt_bitmap) << 3;
4507         pending_vec = find_first_bit(
4508                 (const unsigned long *)sregs->interrupt_bitmap, max_bits);
4509         if (pending_vec < max_bits) {
4510                 kvm_queue_interrupt(vcpu, pending_vec, false);
4511                 pr_debug("Set back pending irq %d\n", pending_vec);
4512                 if (irqchip_in_kernel(vcpu->kvm))
4513                         kvm_pic_clear_isr_ack(vcpu->kvm);
4514         }
4515
4516         kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4517         kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4518         kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
4519         kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
4520         kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
4521         kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
4522
4523         kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4524         kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4525
4526         update_cr8_intercept(vcpu);
4527
4528         /* Older userspace won't unhalt the vcpu on reset. */
4529         if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
4530             sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4531             !(vcpu->arch.cr0 & X86_CR0_PE))
4532                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4533
4534         vcpu_put(vcpu);
4535
4536         return 0;
4537 }
4538
4539 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4540                                         struct kvm_guest_debug *dbg)
4541 {
4542         unsigned long rflags;
4543         int i;
4544
4545         vcpu_load(vcpu);
4546
4547         /*
4548          * Read rflags while potentially injected trace flags are still
4549          * filtered out, i.e. before guest_debug is updated below.
4550          */
4551         rflags = kvm_get_rflags(vcpu);
4552
4553         vcpu->guest_debug = dbg->control;
4554         if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
4555                 vcpu->guest_debug = 0;
4556
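             /*
              * With hardware breakpoints requested, use the debug registers
              * supplied by userspace; otherwise fall back to the guest's own
              * values.
              */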
4557         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4558                 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4559                         vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4560                 vcpu->arch.switch_db_regs =
4561                         (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
4562         } else {
4563                 for (i = 0; i < KVM_NR_DB_REGS; i++)
4564                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
4565                 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4566         }
4567
4568         /*
4569          * Trigger an rflags update that will inject or remove the trace
4570          * flags.
4571          */
4572         kvm_set_rflags(vcpu, rflags);
4573
4574         kvm_x86_ops->set_guest_debug(vcpu, dbg);
4575
4576         if (vcpu->guest_debug & KVM_GUESTDBG_INJECT_DB)
4577                 kvm_queue_exception(vcpu, DB_VECTOR);
4578         else if (vcpu->guest_debug & KVM_GUESTDBG_INJECT_BP)
4579                 kvm_queue_exception(vcpu, BP_VECTOR);
4580
4581         vcpu_put(vcpu);
4582
4583         return 0;
4584 }
4585
4586 /*
4587  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
4588  * we have asm/x86/processor.h
4589  */
4590 struct fxsave {
4591         u16     cwd;
4592         u16     swd;
4593         u16     twd;
4594         u16     fop;
4595         u64     rip;
4596         u64     rdp;
4597         u32     mxcsr;
4598         u32     mxcsr_mask;
4599         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
4600 #ifdef CONFIG_X86_64
4601         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
4602 #else
4603         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
4604 #endif
4605 };
4606
4607 /*
4608  * Translate a guest virtual address to a guest physical address.
4609  */
4610 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4611                                     struct kvm_translation *tr)
4612 {
4613         unsigned long vaddr = tr->linear_address;
4614         gpa_t gpa;
4615
4616         vcpu_load(vcpu);
4617         down_read(&vcpu->kvm->slots_lock);
4618         gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
4619         up_read(&vcpu->kvm->slots_lock);
4620         tr->physical_address = gpa;
4621         tr->valid = gpa != UNMAPPED_GVA;
4622         tr->writeable = 1;
4623         tr->usermode = 0;
4624         vcpu_put(vcpu);
4625
4626         return 0;
4627 }
4628
4629 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4630 {
4631         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4632
4633         vcpu_load(vcpu);
4634
4635         memcpy(fpu->fpr, fxsave->st_space, 128);
4636         fpu->fcw = fxsave->cwd;
4637         fpu->fsw = fxsave->swd;
4638         fpu->ftwx = fxsave->twd;
4639         fpu->last_opcode = fxsave->fop;
4640         fpu->last_ip = fxsave->rip;
4641         fpu->last_dp = fxsave->rdp;
4642         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
4643
4644         vcpu_put(vcpu);
4645
4646         return 0;
4647 }
4648
4649 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4650 {
4651         struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4652
4653         vcpu_load(vcpu);
4654
4655         memcpy(fxsave->st_space, fpu->fpr, 128);
4656         fxsave->cwd = fpu->fcw;
4657         fxsave->swd = fpu->fsw;
4658         fxsave->twd = fpu->ftwx;
4659         fxsave->fop = fpu->last_opcode;
4660         fxsave->rip = fpu->last_ip;
4661         fxsave->rdp = fpu->last_dp;
4662         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
4663
4664         vcpu_put(vcpu);
4665
4666         return 0;
4667 }
4668
4669 void fx_init(struct kvm_vcpu *vcpu)
4670 {
4671         unsigned after_mxcsr_mask;
4672
4673         /*
4674          * Touch the FPU for the first time in a non-atomic context: if
4675          * this is the first FPU instruction, the exception handler will
4676          * fire before the instruction returns and will have to allocate
4677          * RAM with GFP_KERNEL, which can sleep.
4678          */
4679         if (!used_math())
4680                 kvm_fx_save(&vcpu->arch.host_fx_image);
4681
4682         /* Initialize guest FPU by resetting ours and saving into guest's */
4683         preempt_disable();
4684         kvm_fx_save(&vcpu->arch.host_fx_image);
4685         kvm_fx_finit();
4686         kvm_fx_save(&vcpu->arch.guest_fx_image);
4687         kvm_fx_restore(&vcpu->arch.host_fx_image);
4688         preempt_enable();
4689
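             /*
              * CR0.ET is hardwired to 1 on modern CPUs; 0x1f80 is the
              * power-on default MXCSR. Clear everything in the guest image
              * beyond the mxcsr_mask field.
              */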
4690         vcpu->arch.cr0 |= X86_CR0_ET;
4691         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
4692         vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
4693         memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
4694                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
4695 }
4696 EXPORT_SYMBOL_GPL(fx_init);
4697
4698 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4699 {
4700         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
4701                 return;
4702
4703         vcpu->guest_fpu_loaded = 1;
4704         kvm_fx_save(&vcpu->arch.host_fx_image);
4705         kvm_fx_restore(&vcpu->arch.guest_fx_image);
4706 }
4707 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4708
4709 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4710 {
4711         if (!vcpu->guest_fpu_loaded)
4712                 return;
4713
4714         vcpu->guest_fpu_loaded = 0;
4715         kvm_fx_save(&vcpu->arch.guest_fx_image);
4716         kvm_fx_restore(&vcpu->arch.host_fx_image);
4717         ++vcpu->stat.fpu_reload;
4718 }
4719 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4720
4721 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4722 {
4723         if (vcpu->arch.time_page) {
4724                 kvm_release_page_dirty(vcpu->arch.time_page);
4725                 vcpu->arch.time_page = NULL;
4726         }
4727
4728         kvm_x86_ops->vcpu_free(vcpu);
4729 }
4730
4731 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
4732                                                 unsigned int id)
4733 {
4734         return kvm_x86_ops->vcpu_create(kvm, id);
4735 }
4736
4737 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4738 {
4739         int r;
4740
4741         /* We do fxsave, which requires a 16-byte aligned save area. */
4742         BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4743
4744         vcpu->arch.mtrr_state.have_fixed = 1;
4745         vcpu_load(vcpu);
4746         r = kvm_arch_vcpu_reset(vcpu);
4747         if (r == 0)
4748                 r = kvm_mmu_setup(vcpu);
4749         vcpu_put(vcpu);
4750         if (r < 0)
4751                 goto free_vcpu;
4752
4753         return 0;
4754 free_vcpu:
4755         kvm_x86_ops->vcpu_free(vcpu);
4756         return r;
4757 }
4758
4759 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
4760 {
4761         vcpu_load(vcpu);
4762         kvm_mmu_unload(vcpu);
4763         vcpu_put(vcpu);
4764
4765         kvm_x86_ops->vcpu_free(vcpu);
4766 }
4767
4768 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4769 {
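             /* Drop pending NMI state and reset the debug registers to their
              * architectural reset values before the vendor-specific reset. */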
4770         vcpu->arch.nmi_pending = false;
4771         vcpu->arch.nmi_injected = false;
4772
4773         vcpu->arch.switch_db_regs = 0;
4774         memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
4775         vcpu->arch.dr6 = DR6_FIXED_1;
4776         vcpu->arch.dr7 = DR7_FIXED_1;
4777
4778         return kvm_x86_ops->vcpu_reset(vcpu);
4779 }
4780
4781 int kvm_arch_hardware_enable(void *garbage)
4782 {
4783         /*
4784          * Since this may be called from a hotplug notification,
4785          * we can't get the CPU frequency directly.
4786          */
4787         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
4788                 int cpu = raw_smp_processor_id();
4789                 per_cpu(cpu_tsc_khz, cpu) = 0;
4790         }
4791         return kvm_x86_ops->hardware_enable(garbage);
4792 }
4793
4794 void kvm_arch_hardware_disable(void *garbage)
4795 {
4796         kvm_x86_ops->hardware_disable(garbage);
4797 }
4798
4799 int kvm_arch_hardware_setup(void)
4800 {
4801         return kvm_x86_ops->hardware_setup();
4802 }
4803
4804 void kvm_arch_hardware_unsetup(void)
4805 {
4806         kvm_x86_ops->hardware_unsetup();
4807 }
4808
4809 void kvm_arch_check_processor_compat(void *rtn)
4810 {
4811         kvm_x86_ops->check_processor_compatibility(rtn);
4812 }
4813
4814 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4815 {
4816         struct page *page;
4817         struct kvm *kvm;
4818         int r;
4819
4820         BUG_ON(vcpu->kvm == NULL);
4821         kvm = vcpu->kvm;
4822
4823         vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4824         if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
4825                 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4826         else
4827                 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
4828
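             /* pio_data is a zeroed page shared with userspace for emulated
              * port I/O data. */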
4829         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
4830         if (!page) {
4831                 r = -ENOMEM;
4832                 goto fail;
4833         }
4834         vcpu->arch.pio_data = page_address(page);
4835
4836         r = kvm_mmu_create(vcpu);
4837         if (r < 0)
4838                 goto fail_free_pio_data;
4839
4840         if (irqchip_in_kernel(kvm)) {
4841                 r = kvm_create_lapic(vcpu);
4842                 if (r < 0)
4843                         goto fail_mmu_destroy;
4844         }
4845
4846         vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
4847                                        GFP_KERNEL);
4848         if (!vcpu->arch.mce_banks) {
4849                 r = -ENOMEM;
4850                 goto fail_mmu_destroy;
4851         }
4852         vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
4853
4854         return 0;
4855
4856 fail_mmu_destroy:
4857         kvm_mmu_destroy(vcpu);
4858 fail_free_pio_data:
4859         free_page((unsigned long)vcpu->arch.pio_data);
4860 fail:
4861         return r;
4862 }
4863
4864 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4865 {
4866         kvm_free_lapic(vcpu);
4867         down_read(&vcpu->kvm->slots_lock);
4868         kvm_mmu_destroy(vcpu);
4869         up_read(&vcpu->kvm->slots_lock);
4870         free_page((unsigned long)vcpu->arch.pio_data);
4871 }
4872
4873 struct kvm *kvm_arch_create_vm(void)
4874 {
4875         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
4876
4877         if (!kvm)
4878                 return ERR_PTR(-ENOMEM);
4879
4880         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4881         INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4882
4883         /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
4884         set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
4885
4886         rdtscll(kvm->arch.vm_init_tsc);
4887
4888         return kvm;
4889 }
4890
4891 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
4892 {
4893         vcpu_load(vcpu);
4894         kvm_mmu_unload(vcpu);
4895         vcpu_put(vcpu);
4896 }
4897
4898 static void kvm_free_vcpus(struct kvm *kvm)
4899 {
4900         unsigned int i;
4901         struct kvm_vcpu *vcpu;
4902
4903         /*
4904          * Unpin any mmu pages first.
4905          */
4906         kvm_for_each_vcpu(i, vcpu, kvm)
4907                 kvm_unload_vcpu_mmu(vcpu);
4908         kvm_for_each_vcpu(i, vcpu, kvm)
4909                 kvm_arch_vcpu_free(vcpu);
4910
4911         mutex_lock(&kvm->lock);
4912         for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
4913                 kvm->vcpus[i] = NULL;
4914
4915         atomic_set(&kvm->online_vcpus, 0);
4916         mutex_unlock(&kvm->lock);
4917 }
4918
4919 void kvm_arch_sync_events(struct kvm *kvm)
4920 {
4921         kvm_free_all_assigned_devices(kvm);
4922 }
4923
4924 void kvm_arch_destroy_vm(struct kvm *kvm)
4925 {
4926         kvm_iommu_unmap_guest(kvm);
4927         kvm_free_pit(kvm);
4928         kfree(kvm->arch.vpic);
4929         kfree(kvm->arch.vioapic);
4930         kvm_free_vcpus(kvm);
4931         kvm_free_physmem(kvm);
4932         if (kvm->arch.apic_access_page)
4933                 put_page(kvm->arch.apic_access_page);
4934         if (kvm->arch.ept_identity_pagetable)
4935                 put_page(kvm->arch.ept_identity_pagetable);
4936         kfree(kvm);
4937 }
4938
4939 int kvm_arch_set_memory_region(struct kvm *kvm,
4940                                 struct kvm_userspace_memory_region *mem,
4941                                 struct kvm_memory_slot old,
4942                                 int user_alloc)
4943 {
4944         int npages = mem->memory_size >> PAGE_SHIFT;
4945         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
4946
4947         /* To keep backward compatibility with older userspace,
4948          * x86 needs to handle the !user_alloc case.
4949          */
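             /* A slot not backed by userspace memory gets an anonymous
              * mapping in the current process here and is unmapped again
              * when the slot is deleted. */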
4950         if (!user_alloc) {
4951                 if (npages && !old.rmap) {
4952                         unsigned long userspace_addr;
4953
4954                         down_write(&current->mm->mmap_sem);
4955                         userspace_addr = do_mmap(NULL, 0,
4956                                                  npages * PAGE_SIZE,
4957                                                  PROT_READ | PROT_WRITE,
4958                                                  MAP_PRIVATE | MAP_ANONYMOUS,
4959                                                  0);
4960                         up_write(&current->mm->mmap_sem);
4961
4962                         if (IS_ERR((void *)userspace_addr))
4963                                 return PTR_ERR((void *)userspace_addr);
4964
4965                         /* set userspace_addr atomically for kvm_hva_to_rmapp */
4966                         spin_lock(&kvm->mmu_lock);
4967                         memslot->userspace_addr = userspace_addr;
4968                         spin_unlock(&kvm->mmu_lock);
4969                 } else {
4970                         if (!old.user_alloc && old.rmap) {
4971                                 int ret;
4972
4973                                 down_write(&current->mm->mmap_sem);
4974                                 ret = do_munmap(current->mm, old.userspace_addr,
4975                                                 old.npages * PAGE_SIZE);
4976                                 up_write(&current->mm->mmap_sem);
4977                                 if (ret < 0)
4978                                         printk(KERN_WARNING
4979                                        "kvm_vm_ioctl_set_memory_region: "
4980                                        "failed to munmap memory\n");
4981                         }
4982                 }
4983         }
4984
4985         spin_lock(&kvm->mmu_lock);
4986         if (!kvm->arch.n_requested_mmu_pages) {
4987                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4988                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4989         }
4990
4991         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4992         spin_unlock(&kvm->mmu_lock);
4993
4994         return 0;
4995 }
4996
4997 void kvm_arch_flush_shadow(struct kvm *kvm)
4998 {
4999         kvm_mmu_zap_all(kvm);
5000         kvm_reload_remote_mmus(kvm);
5001 }
5002
5003 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
5004 {
5005         return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
5006                 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
5007                 || vcpu->arch.nmi_pending ||
5008                 (kvm_arch_interrupt_allowed(vcpu) &&
5009                  kvm_cpu_has_interrupt(vcpu));
5010 }
5011
5012 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
5013 {
5014         int me;
5015         int cpu = vcpu->cpu;
5016
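             /* Wake the vcpu if it is blocked in halt, and kick it out of
              * guest mode with an IPI if it is running on another CPU. */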
5017         if (waitqueue_active(&vcpu->wq)) {
5018                 wake_up_interruptible(&vcpu->wq);
5019                 ++vcpu->stat.halt_wakeup;
5020         }
5021
5022         me = get_cpu();
5023         if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
5024                 if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
5025                         smp_send_reschedule(cpu);
5026         put_cpu();
5027 }
5028
5029 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
5030 {
5031         return kvm_x86_ops->interrupt_allowed(vcpu);
5032 }
5033
5034 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
5035 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
5036 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
5037 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
5038 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
5039 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
5040 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
5041 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5042 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5043 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5044 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);