Merge branch 'kvm-updates/2.6.32' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Linus Torvalds [Tue, 15 Sep 2009 00:43:43 +0000 (17:43 -0700)]
* 'kvm-updates/2.6.32' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (202 commits)
  MAINTAINERS: update KVM entry
  KVM: correct error-handling code
  KVM: fix compile warnings on s390
  KVM: VMX: Check cpl before emulating debug register access
  KVM: fix misreporting of coalesced interrupts by kvm tracer
  KVM: x86: drop duplicate kvm_flush_remote_tlb calls
  KVM: VMX: call vmx_load_host_state() only if msr is cached
  KVM: VMX: Conditionally reload debug register 6
  KVM: Use thread debug register storage instead of kvm specific data
  KVM guest: do not batch pte updates from interrupt context
  KVM: Fix coalesced interrupt reporting in IOAPIC
  KVM guest: fix bogus wallclock physical address calculation
  KVM: VMX: Fix cr8 exiting control clobbering by EPT
  KVM: Optimize kvm_mmu_unprotect_page_virt() for tdp
  KVM: Document KVM_CAP_IRQCHIP
  KVM: Protect update_cr8_intercept() when running without an apic
  KVM: VMX: Fix EPT with WP bit change during paging
  KVM: Use kvm_{read,write}_guest_virt() to read and write segment descriptors
  KVM: x86 emulator: Add adc and sbb missing decoder flags
  KVM: Add missing #include
  ...

1  2 
Documentation/ioctl/ioctl-number.txt
Documentation/kernel-parameters.txt
MAINTAINERS
arch/s390/include/asm/kvm_host.h
arch/x86/include/asm/apicdef.h
arch/x86/kvm/x86.c
arch/x86/mm/highmem_32.c

@@@ -121,7 -121,6 +121,7 @@@ Code       Seq#    Include File            Comment
  'c'   00-7F   linux/comstats.h        conflict!
  'c'   00-7F   linux/coda.h            conflict!
  'c'   80-9F   arch/s390/include/asm/chsc.h
 +'c'   A0-AF   arch/x86/include/asm/msr.h
  'd'   00-FF   linux/char/drm/drm/h    conflict!
  'd'   F0-FF   linux/digi1.h
  'e'   all     linux/digi1.h           conflict!
  0xAD  00      Netfilter device        in development:
                                        <mailto:rusty@rustcorp.com.au>  
  0xAE  all     linux/kvm.h             Kernel-based Virtual Machine
-                                       <mailto:kvm-devel@lists.sourceforge.net>
+                                       <mailto:kvm@vger.kernel.org>
  0xB0  all     RATIO devices           in development:
                                        <mailto:vgo@ratio.de>
  0xB1  00-1F   PPPoX                   <mailto:mostrows@styx.uwaterloo.ca>
@@@ -57,6 -57,7 +57,7 @@@ parameter is applicable
        ISAPNP  ISA PnP code is enabled.
        ISDN    Appropriate ISDN support is enabled.
        JOY     Appropriate joystick support is enabled.
+       KVM     Kernel Virtual Machine support is enabled.
        LIBATA  Libata driver is enabled
        LP      Printer support is enabled.
        LOOP    Loopback device support is enabled.
@@@ -1098,6 -1099,44 +1099,44 @@@ and is between 256 and 4096 characters
        kstack=N        [X86] Print N words from the kernel stack
                        in oops dumps.
  
+       kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
+                       Default is 0 (don't ignore, but inject #GP)
+       kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging.
+                       Default is 1 (enabled)
+       kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
+                       Default is 0 (off)
+       kvm-amd.npt=    [KVM,AMD] Disable nested paging (virtualized MMU)
+                       for all guests.
+                       Default is 1 (enabled) if in 64bit or 32bit-PAE mode
+       kvm-intel.bypass_guest_pf=
+                       [KVM,Intel] Disables bypassing of guest page faults
+                       on Intel chips. Default is 1 (enabled)
+       kvm-intel.ept=  [KVM,Intel] Disable extended page tables
+                       (virtualized MMU) support on capable Intel chips.
+                       Default is 1 (enabled)
+       kvm-intel.emulate_invalid_guest_state=
+                       [KVM,Intel] Enable emulation of invalid guest states
+                       Default is 0 (disabled)
+       kvm-intel.flexpriority=
+                       [KVM,Intel] Disable FlexPriority feature (TPR shadow).
+                       Default is 1 (enabled)
+       kvm-intel.unrestricted_guest=
+                       [KVM,Intel] Disable unrestricted guest feature
+                       (virtualized real and unpaged mode) on capable
+                       Intel chips. Default is 1 (enabled)
+       kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification
+                       feature (tagged TLBs) on capable Intel chips.
+                       Default is 1 (enabled)
        l2cr=           [PPC]
  
        l3cr=           [PPC]
                        [NFS] set the TCP port on which the NFSv4 callback
                        channel should listen.
  
 +      nfs.cache_getent=
 +                      [NFS] sets the pathname to the program which is used
 +                      to update the NFS client cache entries.
 +
 +      nfs.cache_getent_timeout=
 +                      [NFS] sets the timeout after which an attempt to
 +                      update a cache entry is deemed to have failed.
 +
        nfs.idmap_cache_timeout=
                        [NFS] set the maximum lifetime for idmapper cache
                        entries.
                        symbolic names: lapic and ioapic
                        Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic
  
 +      netpoll.carrier_timeout=
 +                      [NET] Specifies amount of time (in seconds) that
 +                      netpoll should wait for a carrier. By default netpoll
 +                      waits 4 seconds.
 +
        no387           [BUGS=X86-32] Tells the kernel to use the 387 maths
                        emulation library even if a 387 maths coprocessor
                        is present.
        stifb=          [HW]
                        Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
  
 +      sunrpc.min_resvport=
 +      sunrpc.max_resvport=
 +                      [NFS,SUNRPC]
 +                      SunRPC servers often require that client requests
 +                      originate from a privileged port (i.e. a port in the
 +                      range 0 < portnr < 1024).
 +                      An administrator who wishes to reserve some of these
 +                      ports for other uses may adjust the range that the
 +                      kernel's sunrpc client considers to be privileged
 +                      using these two parameters to set the minimum and
 +                      maximum port values.
 +
        sunrpc.pool_mode=
                        [NFS]
                        Control how the NFS server code allocates CPUs to
                        pernode     one pool for each NUMA node (equivalent
                                    to global on non-NUMA machines)
  
 +      sunrpc.tcp_slot_table_entries=
 +      sunrpc.udp_slot_table_entries=
 +                      [NFS,SUNRPC]
 +                      Sets the upper limit on the number of simultaneous
 +                      RPC calls that can be sent from the client to a
 +                      server. Increasing these values may allow you to
 +                      improve throughput, but will also increase the
 +                      amount of memory reserved for use by the client.
 +
        swiotlb=        [IA-64] Number of I/O TLB slabs
  
        switches=       [HW,M68k]
        trace_buf_size=nn[KMG]
                        [FTRACE] will set tracing buffer size.
  
 +      trace_event=[event-list]
 +                      [FTRACE] Set and start specified trace events in order
 +                      to facilitate early boot debugging.
 +                      See also Documentation/trace/events.txt
 +
        trix=           [HW,OSS] MediaTrix AudioTrix Pro
                        Format:
                        <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
diff --combined MAINTAINERS
@@@ -439,7 -439,7 +439,7 @@@ F: drivers/hwmon/ams
  AMSO1100 RNIC DRIVER
  M:    Tom Tucker <tom@opengridcomputing.com>
  M:    Steve Wise <swise@opengridcomputing.com>
 -L:    general@lists.openfabrics.org
 +L:    linux-rdma@vger.kernel.org
  S:    Maintained
  F:    drivers/infiniband/hw/amso1100/
  
@@@ -876,7 -876,6 +876,7 @@@ M: "Luis R. Rodriguez" <lrodriguez@athe
  M:    Bob Copeland <me@bobcopeland.com>
  L:    linux-wireless@vger.kernel.org
  L:    ath5k-devel@lists.ath5k.org
 +W:    http://wireless.kernel.org/en/users/Drivers/ath5k
  S:    Maintained
  F:    drivers/net/wireless/ath/ath5k/
  
@@@ -888,7 -887,6 +888,7 @@@ M: Vasanthakumar Thiagarajan <vasanth@a
  M:    Senthil Balasubramanian <senthilkumar@atheros.com>
  L:    linux-wireless@vger.kernel.org
  L:    ath9k-devel@lists.ath9k.org
 +W:    http://wireless.kernel.org/en/users/Drivers/ath9k
  S:    Supported
  F:    drivers/net/wireless/ath/ath9k/
  
@@@ -1496,7 -1494,7 +1496,7 @@@ F:      drivers/net/cxgb3
  
  CXGB3 IWARP RNIC DRIVER (IW_CXGB3)
  M:    Steve Wise <swise@chelsio.com>
 -L:    general@lists.openfabrics.org
 +L:    linux-rdma@vger.kernel.org
  W:    http://www.openfabrics.org
  S:    Supported
  F:    drivers/infiniband/hw/cxgb3/
@@@ -1870,7 -1868,7 +1870,7 @@@ F:      fs/efs
  EHCA (IBM GX bus InfiniBand adapter) DRIVER
  M:    Hoang-Nam Nguyen <hnguyen@de.ibm.com>
  M:    Christoph Raisch <raisch@de.ibm.com>
 -L:    general@lists.openfabrics.org
 +L:    linux-rdma@vger.kernel.org
  S:    Supported
  F:    drivers/infiniband/hw/ehca/
  
@@@ -2554,7 -2552,7 +2554,7 @@@ INFINIBAND SUBSYSTE
  M:    Roland Dreier <rolandd@cisco.com>
  M:    Sean Hefty <sean.hefty@intel.com>
  M:    Hal Rosenstock <hal.rosenstock@gmail.com>
 -L:    general@lists.openfabrics.org (moderated for non-subscribers)
 +L:    linux-rdma@vger.kernel.org
  W:    http://www.openib.org/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband.git
  S:    Supported
@@@ -2662,21 -2660,25 +2662,21 @@@ F:   drivers/net/ixgbe
  
  INTEL PRO/WIRELESS 2100 NETWORK CONNECTION SUPPORT
  M:    Zhu Yi <yi.zhu@intel.com>
 -M:    James Ketrenos <jketreno@linux.intel.com>
  M:    Reinette Chatre <reinette.chatre@intel.com>
 +M:    Intel Linux Wireless <ilw@linux.intel.com>
  L:    linux-wireless@vger.kernel.org
 -L:    ipw2100-devel@lists.sourceforge.net
 -W:    http://lists.sourceforge.net/mailman/listinfo/ipw2100-devel
  W:    http://ipw2100.sourceforge.net
 -S:    Supported
 +S:    Odd Fixes
  F:    Documentation/networking/README.ipw2100
  F:    drivers/net/wireless/ipw2x00/ipw2100.*
  
  INTEL PRO/WIRELESS 2915ABG NETWORK CONNECTION SUPPORT
  M:    Zhu Yi <yi.zhu@intel.com>
 -M:    James Ketrenos <jketreno@linux.intel.com>
  M:    Reinette Chatre <reinette.chatre@intel.com>
 +M:    Intel Linux Wireless <ilw@linux.intel.com>
  L:    linux-wireless@vger.kernel.org
 -L:    ipw2100-devel@lists.sourceforge.net
 -W:    http://lists.sourceforge.net/mailman/listinfo/ipw2100-devel
  W:    http://ipw2200.sourceforge.net
 -S:    Supported
 +S:    Odd Fixes
  F:    Documentation/networking/README.ipw2200
  F:    drivers/net/wireless/ipw2x00/ipw2200.*
  
@@@ -2693,8 -2695,8 +2693,8 @@@ F:      include/linux/wimax/i2400m.
  INTEL WIRELESS WIFI LINK (iwlwifi)
  M:    Zhu Yi <yi.zhu@intel.com>
  M:    Reinette Chatre <reinette.chatre@intel.com>
 +M:    Intel Linux Wireless <ilw@linux.intel.com>
  L:    linux-wireless@vger.kernel.org
 -L:    ipw3945-devel@lists.sourceforge.net
  W:    http://intellinuxwireless.org
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/iwlwifi/iwlwifi-2.6.git
  S:    Supported
@@@ -2727,7 -2729,7 +2727,7 @@@ F:      drivers/net/ipg.
  
  IPATH DRIVER
  M:    Ralph Campbell <infinipath@qlogic.com>
 -L:    general@lists.openfabrics.org
 +L:    linux-rdma@vger.kernel.org
  T:    git git://git.qlogic.com/ipath-linux-2.6
  S:    Supported
  F:    drivers/infiniband/hw/ipath/
@@@ -2926,6 -2928,7 +2926,7 @@@ F:      include/linux/sunrpc
  
  KERNEL VIRTUAL MACHINE (KVM)
  M:    Avi Kivity <avi@redhat.com>
+ M:    Marcelo Tosatti <mtosatti@redhat.com>
  L:    kvm@vger.kernel.org
  W:    http://kvm.qumranet.com
  S:    Supported
@@@ -3277,12 -3280,6 +3278,12 @@@ S:    Supporte
  F:    drivers/net/mv643xx_eth.*
  F:    include/linux/mv643xx.h
  
 +MARVELL MWL8K WIRELESS DRIVER
 +M:    Lennert Buytenhek <buytenh@marvell.com>
 +L:    linux-wireless@vger.kernel.org
 +S:    Supported
 +F:    drivers/net/wireless/mwl8k.c
 +
  MARVELL SOC MMC/SD/SDIO CONTROLLER DRIVER
  M:    Nicolas Pitre <nico@cam.org>
  S:    Maintained
@@@ -3489,7 -3486,7 +3490,7 @@@ F:      drivers/scsi/NCR_D700.
  NETEFFECT IWARP RNIC DRIVER (IW_NES)
  M:    Faisal Latif <faisal.latif@intel.com>
  M:    Chien Tung <chien.tin.tung@intel.com>
 -L:    general@lists.openfabrics.org
 +L:    linux-rdma@vger.kernel.org
  W:    http://www.neteffect.com
  S:    Supported
  F:    drivers/infiniband/hw/nes/
@@@ -3595,12 -3592,9 +3596,12 @@@ M:    "John W. Linville" <linville@tuxdriv
  L:    linux-wireless@vger.kernel.org
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git
  S:    Maintained
 +F:    net/mac80211/
 +F:    net/rfkill/
  F:    net/wireless/
  F:    include/net/ieee80211*
  F:    include/linux/wireless.h
 +F:    drivers/net/wireless/
  
  NETWORKING DRIVERS
  L:    netdev@vger.kernel.org
@@@ -4306,7 -4300,7 +4307,7 @@@ L:      linux-wireless@vger.kernel.or
  W:    http://linuxwireless.org/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-testing.git
  S:    Maintained
 -F:    drivers/net/wireless/rtl818*
 +F:    drivers/net/wireless/rtl818x/rtl8180*
  
  RTL8187 WIRELESS DRIVER
  M:    Herton Ronaldo Krzesinski <herton@mandriva.com.br>
@@@ -4533,10 -4527,9 +4534,10 @@@ S:    Supporte
  F:    drivers/net/benet/
  
  SFC NETWORK DRIVER
 -P:    Steve Hodgson
 -P:    Ben Hutchings
 -M:    Robert Stonehouse <linux-net-drivers@solarflare.com>
 +M:    Solarflare linux maintainers <linux-net-drivers@solarflare.com>
 +M:    Steve Hodgson <shodgson@solarflare.com>
 +M:    Ben Hutchings <bhutchings@solarflare.com>
 +L:    netdev@vger.kernel.org
  S:    Supported
  F:    drivers/net/sfc/
  
@@@ -5586,24 -5579,6 +5587,24 @@@ M:    Miloslav Trmac <mitr@volny.cz
  S:    Maintained
  F:    drivers/input/misc/wistron_btns.c
  
 +WL1251 WIRELESS DRIVER
 +P:    Kalle Valo
 +M:    kalle.valo@nokia.com
 +L:    linux-wireless@vger.kernel.org
 +W:    http://wireless.kernel.org
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-testing.git
 +S:    Maintained
 +F:    drivers/net/wireless/wl12xx/*
 +X:    drivers/net/wireless/wl12xx/wl1271*
 +
 +WL1271 WIRELESS DRIVER
 +M:    Luciano Coelho <luciano.coelho@nokia.com>
 +L:    linux-wireless@vger.kernel.org
 +W:    http://wireless.kernel.org
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-testing.git
 +S:    Maintained
 +F:    drivers/net/wireless/wl12xx/wl1271*
 +
  WL3501 WIRELESS PCMCIA CARD DRIVER
  M:    Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
  L:    linux-wireless@vger.kernel.org
@@@ -1,7 -1,7 +1,7 @@@
  /*
   * asm-s390/kvm_host.h - definition for kernel virtual machines on s390
   *
-  * Copyright IBM Corp. 2008
+  * Copyright IBM Corp. 2008,2009
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License (version 2 only)
@@@ -17,7 -17,7 +17,7 @@@
  #include <linux/interrupt.h>
  #include <linux/kvm_host.h>
  #include <asm/debug.h>
 -#include <asm/cpuid.h>
 +#include <asm/cpu.h>
  
  #define KVM_MAX_VCPUS 64
  #define KVM_MEMORY_SLOTS 32
@@@ -40,7 -40,11 +40,11 @@@ struct sca_block 
        struct sca_entry cpu[64];
  } __attribute__((packed));
  
- #define KVM_PAGES_PER_HPAGE 256
+ #define KVM_NR_PAGE_SIZES 2
+ #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8)
+ #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
+ #define KVM_HPAGE_MASK(x)     (~(KVM_HPAGE_SIZE(x) - 1))
+ #define KVM_PAGES_PER_HPAGE(x)        (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
  
  #define CPUSTAT_HOST       0x80000000
  #define CPUSTAT_WAIT       0x10000000
@@@ -182,8 -186,9 +186,9 @@@ struct kvm_s390_interrupt_info 
  };
  
  /* for local_interrupt.action_flags */
- #define ACTION_STORE_ON_STOP 1
- #define ACTION_STOP_ON_STOP  2
+ #define ACTION_STORE_ON_STOP          (1<<0)
+ #define ACTION_STOP_ON_STOP           (1<<1)
+ #define ACTION_RELOADVCPU_ON_STOP     (1<<2)
  
  struct kvm_s390_local_interrupt {
        spinlock_t lock;
@@@ -217,8 -222,8 +222,8 @@@ struct kvm_vcpu_arch 
        struct hrtimer    ckc_timer;
        struct tasklet_struct tasklet;
        union  {
 -              cpuid_t   cpu_id;
 -              u64       stidp_data;
 +              struct cpuid    cpu_id;
 +              u64             stidp_data;
        };
  };
  
@@@ -227,8 -232,6 +232,6 @@@ struct kvm_vm_stat 
  };
  
  struct kvm_arch{
-       unsigned long guest_origin;
-       unsigned long guest_memsize;
        struct sca_block *sca;
        debug_info_t *dbf;
        struct kvm_s390_float_interrupt float_int;
@@@ -8,13 -8,13 +8,14 @@@
   * Ingo Molnar <mingo@redhat.com>, 1999, 2000
   */
  
 -#define       APIC_DEFAULT_PHYS_BASE  0xfee00000
 +#define IO_APIC_DEFAULT_PHYS_BASE     0xfec00000
 +#define       APIC_DEFAULT_PHYS_BASE          0xfee00000
  
  #define       APIC_ID         0x20
  
  #define       APIC_LVR        0x30
  #define               APIC_LVR_MASK           0xFF00FF
+ #define               APIC_LVR_DIRECTED_EOI   (1 << 24)
  #define               GET_APIC_VERSION(x)     ((x) & 0xFFu)
  #define               GET_APIC_MAXLVT(x)      (((x) >> 16) & 0xFFu)
  #ifdef CONFIG_X86_32
@@@ -41,6 -41,7 +42,7 @@@
  #define               APIC_DFR_CLUSTER                0x0FFFFFFFul
  #define               APIC_DFR_FLAT                   0xFFFFFFFFul
  #define       APIC_SPIV       0xF0
+ #define               APIC_SPIV_DIRECTED_EOI          (1 << 12)
  #define               APIC_SPIV_FOCUS_DISABLED        (1 << 9)
  #define               APIC_SPIV_APIC_ENABLED          (1 << 8)
  #define       APIC_ISR        0x100
diff --combined arch/x86/kvm/x86.c
  #include <linux/iommu.h>
  #include <linux/intel-iommu.h>
  #include <linux/cpufreq.h>
+ #include <trace/events/kvm.h>
+ #undef TRACE_INCLUDE_FILE
+ #define CREATE_TRACE_POINTS
+ #include "trace.h"
  
  #include <asm/uaccess.h>
  #include <asm/msr.h>
  #include <asm/desc.h>
  #include <asm/mtrr.h>
+ #include <asm/mce.h>
  
  #define MAX_IO_MSRS 256
  #define CR0_RESERVED_BITS                                             \
                          | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
  
  #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+ #define KVM_MAX_MCE_BANKS 32
+ #define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
  /* EFER defaults:
   * - enable syscall per default because it's emulated by KVM
   * - enable LME and LMA per default on 64 bit KVM
@@@ -68,14 -77,16 +77,16 @@@ static u64 __read_mostly efer_reserved_
  #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
  
+ static void update_cr8_intercept(struct kvm_vcpu *vcpu);
  static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
                                    struct kvm_cpuid_entry2 __user *entries);
- struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
-                                             u32 function, u32 index);
  
  struct kvm_x86_ops *kvm_x86_ops;
  EXPORT_SYMBOL_GPL(kvm_x86_ops);
  
+ int ignore_msrs = 0;
+ module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
  struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
        { "pf_guest", VCPU_STAT(pf_guest) },
@@@ -122,18 -133,16 +133,16 @@@ unsigned long segment_base(u16 selector
        if (selector == 0)
                return 0;
  
-       asm("sgdt %0" : "=m"(gdt));
+       kvm_get_gdt(&gdt);
        table_base = gdt.base;
  
        if (selector & 4) {           /* from ldt */
-               u16 ldt_selector;
+               u16 ldt_selector = kvm_read_ldt();
  
                table_base = segment_base(ldt_selector);
        }
        d = (struct desc_struct *)(table_base + (selector & ~7));
-       v = d->base0 | ((unsigned long)d->base1 << 16) |
-               ((unsigned long)d->base2 << 24);
+       v = get_desc_base(d);
  #ifdef CONFIG_X86_64
        if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
                v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
@@@ -176,16 -185,22 +185,22 @@@ void kvm_inject_page_fault(struct kvm_v
        ++vcpu->stat.pf_guest;
  
        if (vcpu->arch.exception.pending) {
-               if (vcpu->arch.exception.nr == PF_VECTOR) {
-                       printk(KERN_DEBUG "kvm: inject_page_fault:"
-                                       " double fault 0x%lx\n", addr);
-                       vcpu->arch.exception.nr = DF_VECTOR;
-                       vcpu->arch.exception.error_code = 0;
-               } else if (vcpu->arch.exception.nr == DF_VECTOR) {
+               switch(vcpu->arch.exception.nr) {
+               case DF_VECTOR:
                        /* triple fault -> shutdown */
                        set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+                       return;
+               case PF_VECTOR:
+                       vcpu->arch.exception.nr = DF_VECTOR;
+                       vcpu->arch.exception.error_code = 0;
+                       return;
+               default:
+                       /* replace the previous exception with a new one in the
+                          hope that instruction re-execution will regenerate
+                          the lost exception */
+                       vcpu->arch.exception.pending = false;
+                       break;
                }
-               return;
        }
        vcpu->arch.cr2 = addr;
        kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
@@@ -207,12 -222,18 +222,18 @@@ void kvm_queue_exception_e(struct kvm_v
  }
  EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
  
- static void __queue_exception(struct kvm_vcpu *vcpu)
+ /*
+  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
+  * a #GP and return false.
+  */
+ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
  {
-       kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
-                                    vcpu->arch.exception.has_error_code,
-                                    vcpu->arch.exception.error_code);
+       if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
+               return true;
+       kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+       return false;
  }
+ EXPORT_SYMBOL_GPL(kvm_require_cpl);
  
  /*
   * Load the pae pdptrs.  Return true if they are all valid.
@@@ -232,7 -253,7 +253,7 @@@ int load_pdptrs(struct kvm_vcpu *vcpu, 
                goto out;
        }
        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
-               if (is_present_pte(pdpte[i]) &&
+               if (is_present_gpte(pdpte[i]) &&
                    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
                        ret = 0;
                        goto out;
        ret = 1;
  
        memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
+       __set_bit(VCPU_EXREG_PDPTR,
+                 (unsigned long *)&vcpu->arch.regs_avail);
+       __set_bit(VCPU_EXREG_PDPTR,
+                 (unsigned long *)&vcpu->arch.regs_dirty);
  out:
  
        return ret;
@@@ -256,6 -281,10 +281,10 @@@ static bool pdptrs_changed(struct kvm_v
        if (is_long_mode(vcpu) || !is_pae(vcpu))
                return false;
  
+       if (!test_bit(VCPU_EXREG_PDPTR,
+                     (unsigned long *)&vcpu->arch.regs_avail))
+               return true;
        r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
        if (r < 0)
                goto out;
@@@ -328,9 -357,6 +357,6 @@@ EXPORT_SYMBOL_GPL(kvm_set_cr0)
  void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
  {
        kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
-       KVMTRACE_1D(LMSW, vcpu,
-                   (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
-                   handler);
  }
  EXPORT_SYMBOL_GPL(kvm_lmsw);
  
@@@ -466,7 -492,7 +492,7 @@@ static u32 msrs_to_save[] = 
  #ifdef CONFIG_X86_64
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
  #endif
-       MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+       MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
        MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
  };
  
@@@ -644,8 -670,7 +670,7 @@@ static void kvm_write_guest_time(struc
  
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
-       kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
-                         &vcpu->hv_clock.tsc_timestamp);
+       kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
        ktime_get_ts(&ts);
        local_irq_restore(flags);
  
@@@ -778,23 -803,60 +803,60 @@@ static int set_msr_mtrr(struct kvm_vcp
        return 0;
  }
  
+ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+ {
+       u64 mcg_cap = vcpu->arch.mcg_cap;
+       unsigned bank_num = mcg_cap & 0xff;
+       switch (msr) {
+       case MSR_IA32_MCG_STATUS:
+               vcpu->arch.mcg_status = data;
+               break;
+       case MSR_IA32_MCG_CTL:
+               if (!(mcg_cap & MCG_CTL_P))
+                       return 1;
+               if (data != 0 && data != ~(u64)0)
+                       return -1;
+               vcpu->arch.mcg_ctl = data;
+               break;
+       default:
+               if (msr >= MSR_IA32_MC0_CTL &&
+                   msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+                       u32 offset = msr - MSR_IA32_MC0_CTL;
+                       /* only 0 or all 1s can be written to IA32_MCi_CTL */
+                       if ((offset & 0x3) == 0 &&
+                           data != 0 && data != ~(u64)0)
+                               return -1;
+                       vcpu->arch.mce_banks[offset] = data;
+                       break;
+               }
+               return 1;
+       }
+       return 0;
+ }
  int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
  {
        switch (msr) {
        case MSR_EFER:
                set_efer(vcpu, data);
                break;
-       case MSR_IA32_MC0_STATUS:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
-                      __func__, data);
+       case MSR_K7_HWCR:
+               data &= ~(u64)0x40;     /* ignore flush filter disable */
+               if (data != 0) {
+                       pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+                               data);
+                       return 1;
+               }
                break;
-       case MSR_IA32_MCG_STATUS:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
-                       __func__, data);
+       case MSR_FAM10H_MMIO_CONF_BASE:
+               if (data != 0) {
+                       pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+                               "0x%llx\n", data);
+                       return 1;
+               }
                break;
-       case MSR_IA32_MCG_CTL:
-               pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
-                       __func__, data);
+       case MSR_AMD64_NB_CFG:
                break;
        case MSR_IA32_DEBUGCTLMSR:
                if (!data) {
        case MSR_IA32_UCODE_REV:
        case MSR_IA32_UCODE_WRITE:
        case MSR_VM_HSAVE_PA:
+       case MSR_AMD64_PATCH_LOADER:
                break;
        case 0x200 ... 0x2ff:
                return set_msr_mtrr(vcpu, msr, data);
        case MSR_IA32_APICBASE:
                kvm_set_apic_base(vcpu, data);
                break;
+       case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+               return kvm_x2apic_msr_write(vcpu, msr, data);
        case MSR_IA32_MISC_ENABLE:
                vcpu->arch.ia32_misc_enable_msr = data;
                break;
                kvm_request_guest_time_update(vcpu);
                break;
        }
+       case MSR_IA32_MCG_CTL:
+       case MSR_IA32_MCG_STATUS:
+       case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+               return set_msr_mce(vcpu, msr, data);
+       /* Performance counters are not protected by a CPUID bit,
+        * so we should check all of them in the generic path for the sake of
+        * cross vendor migration.
+        * Writing a zero into the event select MSRs disables them,
+        * which we perfectly emulate ;-). Any other value should be at least
+        * reported, some guests depend on them.
+        */
+       case MSR_P6_EVNTSEL0:
+       case MSR_P6_EVNTSEL1:
+       case MSR_K7_EVNTSEL0:
+       case MSR_K7_EVNTSEL1:
+       case MSR_K7_EVNTSEL2:
+       case MSR_K7_EVNTSEL3:
+               if (data != 0)
+                       pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+                               "0x%x data 0x%llx\n", msr, data);
+               break;
+       /* at least RHEL 4 unconditionally writes to the perfctr registers,
+        * so we ignore writes to make it happy.
+        */
+       case MSR_P6_PERFCTR0:
+       case MSR_P6_PERFCTR1:
+       case MSR_K7_PERFCTR0:
+       case MSR_K7_PERFCTR1:
+       case MSR_K7_PERFCTR2:
+       case MSR_K7_PERFCTR3:
+               pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+                       "0x%x data 0x%llx\n", msr, data);
+               break;
        default:
-               pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
-               return 1;
+               if (!ignore_msrs) {
+                       pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
+                               msr, data);
+                       return 1;
+               } else {
+                       pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
+                               msr, data);
+                       break;
+               }
        }
        return 0;
  }
@@@ -905,26 -1011,47 +1011,47 @@@ static int get_msr_mtrr(struct kvm_vcp
        return 0;
  }
  
- int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
  {
        u64 data;
+       u64 mcg_cap = vcpu->arch.mcg_cap;
+       unsigned bank_num = mcg_cap & 0xff;
  
        switch (msr) {
-       case 0xc0010010: /* SYSCFG */
-       case 0xc0010015: /* HWCR */
-       case MSR_IA32_PLATFORM_ID:
        case MSR_IA32_P5_MC_ADDR:
        case MSR_IA32_P5_MC_TYPE:
-       case MSR_IA32_MC0_CTL:
-       case MSR_IA32_MCG_STATUS:
+               data = 0;
+               break;
        case MSR_IA32_MCG_CAP:
+               data = vcpu->arch.mcg_cap;
+               break;
        case MSR_IA32_MCG_CTL:
-       case MSR_IA32_MC0_MISC:
-       case MSR_IA32_MC0_MISC+4:
-       case MSR_IA32_MC0_MISC+8:
-       case MSR_IA32_MC0_MISC+12:
-       case MSR_IA32_MC0_MISC+16:
-       case MSR_IA32_MC0_MISC+20:
+               if (!(mcg_cap & MCG_CTL_P))
+                       return 1;
+               data = vcpu->arch.mcg_ctl;
+               break;
+       case MSR_IA32_MCG_STATUS:
+               data = vcpu->arch.mcg_status;
+               break;
+       default:
+               if (msr >= MSR_IA32_MC0_CTL &&
+                   msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+                       u32 offset = msr - MSR_IA32_MC0_CTL;
+                       data = vcpu->arch.mce_banks[offset];
+                       break;
+               }
+               return 1;
+       }
+       *pdata = data;
+       return 0;
+ }
+ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+ {
+       u64 data;
+       switch (msr) {
+       case MSR_IA32_PLATFORM_ID:
        case MSR_IA32_UCODE_REV:
        case MSR_IA32_EBL_CR_POWERON:
        case MSR_IA32_DEBUGCTLMSR:
        case MSR_IA32_LASTBRANCHTOIP:
        case MSR_IA32_LASTINTFROMIP:
        case MSR_IA32_LASTINTTOIP:
+       case MSR_K8_SYSCFG:
+       case MSR_K7_HWCR:
        case MSR_VM_HSAVE_PA:
+       case MSR_P6_PERFCTR0:
+       case MSR_P6_PERFCTR1:
        case MSR_P6_EVNTSEL0:
        case MSR_P6_EVNTSEL1:
        case MSR_K7_EVNTSEL0:
+       case MSR_K7_PERFCTR0:
+       case MSR_K8_INT_PENDING_MSG:
+       case MSR_AMD64_NB_CFG:
+       case MSR_FAM10H_MMIO_CONF_BASE:
                data = 0;
                break;
        case MSR_MTRRcap:
        case MSR_IA32_APICBASE:
                data = kvm_get_apic_base(vcpu);
                break;
+       case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+               return kvm_x2apic_msr_read(vcpu, msr, pdata);
+               break;
        case MSR_IA32_MISC_ENABLE:
                data = vcpu->arch.ia32_misc_enable_msr;
                break;
        case MSR_KVM_SYSTEM_TIME:
                data = vcpu->arch.time;
                break;
+       case MSR_IA32_P5_MC_ADDR:
+       case MSR_IA32_P5_MC_TYPE:
+       case MSR_IA32_MCG_CAP:
+       case MSR_IA32_MCG_CTL:
+       case MSR_IA32_MCG_STATUS:
+       case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+               return get_msr_mce(vcpu, msr, pdata);
        default:
-               pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
-               return 1;
+               if (!ignore_msrs) {
+                       pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+                       return 1;
+               } else {
+                       pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
+                       data = 0;
+               }
+               break;
        }
        *pdata = data;
        return 0;
@@@ -1068,6 -1219,11 +1219,11 @@@ int kvm_dev_ioctl_check_extension(long 
        case KVM_CAP_REINJECT_CONTROL:
        case KVM_CAP_IRQ_INJECT_STATUS:
        case KVM_CAP_ASSIGN_DEV_IRQ:
+       case KVM_CAP_IRQFD:
+       case KVM_CAP_IOEVENTFD:
+       case KVM_CAP_PIT2:
+       case KVM_CAP_PIT_STATE2:
+       case KVM_CAP_SET_IDENTITY_MAP_ADDR:
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
        case KVM_CAP_IOMMU:
                r = iommu_found();
                break;
+       case KVM_CAP_MCE:
+               r = KVM_MAX_MCE_BANKS;
+               break;
        default:
                r = 0;
                break;
@@@ -1147,6 -1306,16 +1306,16 @@@ long kvm_arch_dev_ioctl(struct file *fi
                r = 0;
                break;
        }
+       case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+               u64 mce_cap;
+               mce_cap = KVM_MCE_CAP_SUPPORTED;
+               r = -EFAULT;
+               if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+                       goto out;
+               r = 0;
+               break;
+       }
        default:
                r = -EINVAL;
        }
@@@ -1227,6 -1396,7 +1396,7 @@@ static int kvm_vcpu_ioctl_set_cpuid(str
        vcpu->arch.cpuid_nent = cpuid->nent;
        cpuid_fix_nx_cap(vcpu);
        r = 0;
+       kvm_apic_set_version(vcpu);
  
  out_free:
        vfree(cpuid_entries);
@@@ -1248,6 -1418,7 +1418,7 @@@ static int kvm_vcpu_ioctl_set_cpuid2(st
                           cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
                goto out;
        vcpu->arch.cpuid_nent = cpuid->nent;
+       kvm_apic_set_version(vcpu);
        return 0;
  
  out:
@@@ -1290,6 -1461,7 +1461,7 @@@ static void do_cpuid_ent(struct kvm_cpu
                         u32 index, int *nent, int maxnent)
  {
        unsigned f_nx = is_efer_nx() ? F(NX) : 0;
+       unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
  #ifdef CONFIG_X86_64
        unsigned f_lm = F(LM);
  #else
                F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
                F(PAT) | F(PSE36) | 0 /* Reserved */ |
                f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
-               F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
+               F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
                0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
        /* cpuid 1.ecx */
        const u32 kvm_supported_word4_x86_features =
                0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
                0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
                0 /* Reserved, DCA */ | F(XMM4_1) |
-               F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
+               F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
                0 /* Reserved, XSAVE, OSXSAVE */;
        /* cpuid 0x80000001.ecx */
        const u32 kvm_supported_word6_x86_features =
        case 1:
                entry->edx &= kvm_supported_word0_x86_features;
                entry->ecx &= kvm_supported_word4_x86_features;
+               /* we support x2apic emulation even if host does not support
+                * it since we emulate x2apic in software */
+               entry->ecx |= F(X2APIC);
                break;
        /* function 2 entries are STATEFUL. That is, repeated cpuid commands
         * may return different values. This forces us to get_cpu() before
@@@ -1435,6 -1610,10 +1610,10 @@@ static int kvm_dev_ioctl_get_supported_
        for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
                do_cpuid_ent(&cpuid_entries[nent], func, 0,
                             &nent, cpuid->nent);
+       r = -E2BIG;
+       if (nent >= cpuid->nent)
+               goto out_free;
        r = -EFAULT;
        if (copy_to_user(entries, cpuid_entries,
                         nent * sizeof(struct kvm_cpuid_entry2)))
@@@ -1464,6 -1643,7 +1643,7 @@@ static int kvm_vcpu_ioctl_set_lapic(str
        vcpu_load(vcpu);
        memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
        kvm_apic_post_state_restore(vcpu);
+       update_cr8_intercept(vcpu);
        vcpu_put(vcpu);
  
        return 0;
@@@ -1503,6 -1683,80 +1683,80 @@@ static int vcpu_ioctl_tpr_access_report
        return 0;
  }
  
+ /*
+  * KVM_X86_SETUP_MCE backend: initialise the vcpu's emulated machine-check
+  * state from an MCG_CAP value supplied by userspace.
+  *
+  * @vcpu:    vcpu whose MCE state is configured
+  * @mcg_cap: requested capability value; its low 8 bits are the bank count
+  *
+  * Returns 0 on success.  Returns -EINVAL when the bank count is zero or
+  * exceeds KVM_MAX_MCE_BANKS, or when mcg_cap has bits set outside
+  * KVM_MCE_CAP_SUPPORTED and the 0xff / 0xff0000 fields.
+  */
+ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+                                       u64 mcg_cap)
+ {
+       int r;
+       unsigned bank_num = mcg_cap & 0xff, bank;
+       r = -EINVAL;
+       /*
+        * bank_num is taken directly from userspace and drives the index
+        * into vcpu->arch.mce_banks[] (four u64 slots per bank) below, so
+        * it must be capped at the bank count advertised via KVM_CAP_MCE.
+        * NOTE(review): assumes mce_banks[] has room for KVM_MAX_MCE_BANKS
+        * banks - confirm against the allocation site.
+        */
+       if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
+               goto out;
+       if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+               goto out;
+       r = 0;
+       vcpu->arch.mcg_cap = mcg_cap;
+       /* Init IA32_MCG_CTL to all 1s */
+       if (mcg_cap & MCG_CTL_P)
+               vcpu->arch.mcg_ctl = ~(u64)0;
+       /* Init IA32_MCi_CTL (slot 0 of each bank) to all 1s */
+       for (bank = 0; bank < bank_num; bank++)
+               vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+ out:
+       return r;
+ }
+ /*
+  * KVM_X86_SET_MCE backend: record a machine-check event described by @mce
+  * in the vcpu's emulated bank registers and, for an uncorrected error,
+  * queue an MC_VECTOR exception.
+  *
+  * Each bank occupies four consecutive u64 slots in vcpu->arch.mce_banks:
+  * slot 0 is compared against all-1s as the bank's control value, slot 1
+  * holds the status, slot 2 the address and slot 3 the misc value.
+  *
+  * Returns -EINVAL if the bank index is out of range or the status lacks
+  * MCI_STATUS_VAL; returns 0 otherwise (including when the event is
+  * dropped because reporting is disabled).  May set MCI_STATUS_OVER in
+  * mce->status.
+  */
+ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+                                     struct kvm_x86_mce *mce)
+ {
+       u64 mcg_cap = vcpu->arch.mcg_cap;
+       unsigned bank_num = mcg_cap & 0xff;
+       u64 *bank;
+       u64 uncorrected;
+       if (mce->bank >= bank_num)
+               return -EINVAL;
+       if (!(mce->status & MCI_STATUS_VAL))
+               return -EINVAL;
+       uncorrected = mce->status & MCI_STATUS_UC;
+       bank = vcpu->arch.mce_banks + 4 * mce->bank;
+       if (uncorrected) {
+               /*
+                * if IA32_MCG_CTL is not all 1s, the uncorrected error
+                * reporting is disabled
+                */
+               if ((mcg_cap & MCG_CTL_P) && vcpu->arch.mcg_ctl != ~(u64)0)
+                       return 0;
+               /*
+                * if IA32_MCi_CTL is not all 1s, the uncorrected error
+                * reporting is disabled for the bank
+                */
+               if (bank[0] != ~(u64)0)
+                       return 0;
+               /*
+                * A machine check already in progress, or CR4.MCE clear,
+                * turns the injection into a triple-fault request.
+                */
+               if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+                   !(vcpu->arch.cr4 & X86_CR4_MCE)) {
+                       printk(KERN_DEBUG "kvm: set_mce: "
+                              "injects mce exception while "
+                              "previous one is in progress!\n");
+                       set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+                       return 0;
+               }
+               /* A still-valid previous record is being overwritten. */
+               if (bank[1] & MCI_STATUS_VAL)
+                       mce->status |= MCI_STATUS_OVER;
+               bank[2] = mce->addr;
+               bank[3] = mce->misc;
+               vcpu->arch.mcg_status = mce->mcg_status;
+               bank[1] = mce->status;
+               kvm_queue_exception(vcpu, MC_VECTOR);
+               return 0;
+       }
+       /*
+        * Corrected error: never overwrite a valid uncorrected record,
+        * only flag the overflow on it.
+        */
+       if ((bank[1] & MCI_STATUS_VAL) && (bank[1] & MCI_STATUS_UC)) {
+               bank[1] |= MCI_STATUS_OVER;
+               return 0;
+       }
+       if (bank[1] & MCI_STATUS_VAL)
+               mce->status |= MCI_STATUS_OVER;
+       bank[2] = mce->addr;
+       bank[3] = mce->misc;
+       bank[1] = mce->status;
+       return 0;
+ }
  long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
  {
                kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
                break;
        }
+       case KVM_X86_SETUP_MCE: {
+               u64 mcg_cap;
+               r = -EFAULT;
+               if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+                       goto out;
+               r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+               break;
+       }
+       case KVM_X86_SET_MCE: {
+               struct kvm_x86_mce mce;
+               r = -EFAULT;
+               if (copy_from_user(&mce, argp, sizeof mce))
+                       goto out;
+               r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+               break;
+       }
        default:
                r = -EINVAL;
        }
@@@ -1654,6 -1926,13 +1926,13 @@@ static int kvm_vm_ioctl_set_tss_addr(st
        return ret;
  }
  
+ static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
+                                             u64 ident_addr)
+ {     /* Record ident_addr in the VM's arch state; always returns 0. */
+       kvm->arch.ept_identity_map_addr = ident_addr; /* stored as given; no validation here */
+       return 0;
+ }
  static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
                                          u32 kvm_nr_mmu_pages)
  {
@@@ -1775,19 -2054,25 +2054,25 @@@ static int kvm_vm_ioctl_set_irqchip(str
        r = 0;
        switch (chip->chip_id) {
        case KVM_IRQCHIP_PIC_MASTER:
+               spin_lock(&pic_irqchip(kvm)->lock);
                memcpy(&pic_irqchip(kvm)->pics[0],
                        &chip->chip.pic,
                        sizeof(struct kvm_pic_state));
+               spin_unlock(&pic_irqchip(kvm)->lock);
                break;
        case KVM_IRQCHIP_PIC_SLAVE:
+               spin_lock(&pic_irqchip(kvm)->lock);
                memcpy(&pic_irqchip(kvm)->pics[1],
                        &chip->chip.pic,
                        sizeof(struct kvm_pic_state));
+               spin_unlock(&pic_irqchip(kvm)->lock);
                break;
        case KVM_IRQCHIP_IOAPIC:
+               mutex_lock(&kvm->irq_lock);
                memcpy(ioapic_irqchip(kvm),
                        &chip->chip.ioapic,
                        sizeof(struct kvm_ioapic_state));
+               mutex_unlock(&kvm->irq_lock);
                break;
        default:
                r = -EINVAL;
@@@ -1801,7 -2086,9 +2086,9 @@@ static int kvm_vm_ioctl_get_pit(struct 
  {
        int r = 0;
  
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
        memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
        return r;
  }
  
@@@ -1809,8 -2096,39 +2096,39 @@@ static int kvm_vm_ioctl_set_pit(struct 
  {
        int r = 0;
  
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
        memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
-       kvm_pit_load_count(kvm, 0, ps->channels[0].count);
+       kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       return r;
+ }
+ /*
+  * Snapshot the in-kernel PIT's flags and channel array into *ps while
+  * holding pit_state.lock.  Dereferences kvm->arch.vpit without a NULL
+  * check, so the caller must have verified it.  Always returns 0.
+  */
+ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+ {
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
+       ps->flags = kvm->arch.vpit->pit_state.flags;
+       memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+               sizeof(ps->channels));
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+       return 0;
+ }
+ /*
+  * Load channel state and flags from *ps into the in-kernel PIT under
+  * pit_state.lock, then reload channel 0's count.  Dereferences
+  * kvm->arch.vpit without a NULL check, so the caller must have verified
+  * it.  Always returns 0.
+  */
+ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+ {
+       int r = 0, start;
+       u32 was_legacy, now_legacy;
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
+       was_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+       now_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+       /* start is 1 only when the HPET-legacy flag goes from clear to set */
+       start = !was_legacy && now_legacy;
+       memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
+              sizeof(kvm->arch.vpit->pit_state.channels));
+       kvm->arch.vpit->pit_state.flags = ps->flags;
+       kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
        return r;
  }
  
@@@ -1819,7 -2137,9 +2137,9 @@@ static int kvm_vm_ioctl_reinject(struc
  {
        if (!kvm->arch.vpit)
                return -ENXIO;
+       mutex_lock(&kvm->arch.vpit->pit_state.lock);
        kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+       mutex_unlock(&kvm->arch.vpit->pit_state.lock);
        return 0;
  }
  
@@@ -1845,7 -2165,6 +2165,6 @@@ int kvm_vm_ioctl_get_dirty_log(struct k
                spin_lock(&kvm->mmu_lock);
                kvm_mmu_slot_remove_write_access(kvm, log->slot);
                spin_unlock(&kvm->mmu_lock);
-               kvm_flush_remote_tlbs(kvm);
                memslot = &kvm->memslots[log->slot];
                n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
                memset(memslot->dirty_bitmap, 0, n);
@@@ -1869,7 -2188,9 +2188,9 @@@ long kvm_arch_vm_ioctl(struct file *fil
         */
        union {
                struct kvm_pit_state ps;
+               struct kvm_pit_state2 ps2;
                struct kvm_memory_alias alias;
+               struct kvm_pit_config pit_config;
        } u;
  
        switch (ioctl) {
                if (r < 0)
                        goto out;
                break;
+       case KVM_SET_IDENTITY_MAP_ADDR: {
+               u64 ident_addr;
+               r = -EFAULT;
+               if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
+                       goto out;
+               r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+               if (r < 0)
+                       goto out;
+               break;
+       }
        case KVM_SET_MEMORY_REGION: {
                struct kvm_memory_region kvm_mem;
                struct kvm_userspace_memory_region kvm_userspace_mem;
                }
                break;
        case KVM_CREATE_PIT:
-               mutex_lock(&kvm->lock);
+               u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+               goto create_pit;
+       case KVM_CREATE_PIT2:
+               r = -EFAULT;
+               if (copy_from_user(&u.pit_config, argp,
+                                  sizeof(struct kvm_pit_config)))
+                       goto out;
+       create_pit:
+               down_write(&kvm->slots_lock);
                r = -EEXIST;
                if (kvm->arch.vpit)
                        goto create_pit_unlock;
                r = -ENOMEM;
-               kvm->arch.vpit = kvm_create_pit(kvm);
+               kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
                if (kvm->arch.vpit)
                        r = 0;
        create_pit_unlock:
-               mutex_unlock(&kvm->lock);
+               up_write(&kvm->slots_lock);
                break;
        case KVM_IRQ_LINE_STATUS:
        case KVM_IRQ_LINE: {
                        goto out;
                if (irqchip_in_kernel(kvm)) {
                        __s32 status;
-                       mutex_lock(&kvm->lock);
+                       mutex_lock(&kvm->irq_lock);
                        status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
                                        irq_event.irq, irq_event.level);
-                       mutex_unlock(&kvm->lock);
+                       mutex_unlock(&kvm->irq_lock);
                        if (ioctl == KVM_IRQ_LINE_STATUS) {
                                irq_event.status = status;
                                if (copy_to_user(argp, &irq_event,
                r = 0;
                break;
        }
+       case KVM_GET_PIT2: {
+               r = -ENXIO;
+               if (!kvm->arch.vpit)
+                       goto out;
+               r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
+                       goto out;
+               r = 0;
+               break;
+       }
+       case KVM_SET_PIT2: {
+               r = -EFAULT;
+               if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
+                       goto out;
+               r = -ENXIO;
+               if (!kvm->arch.vpit)
+                       goto out;
+               r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
        case KVM_REINJECT_CONTROL: {
                struct kvm_reinject_control control;
                r =  -EFAULT;
@@@ -2075,35 -2441,23 +2441,23 @@@ static void kvm_init_msr_list(void
        num_msrs_to_save = j;
  }
  
- /*
-  * Only apic need an MMIO device hook, so shortcut now..
-  */
- static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
-                                               gpa_t addr, int len,
-                                               int is_write)
+ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
+                          const void *v)
  {
-       struct kvm_io_device *dev;
+       if (vcpu->arch.apic &&
+           !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
+               return 0;
  
-       if (vcpu->arch.apic) {
-               dev = &vcpu->arch.apic->dev;
-               if (dev->in_range(dev, addr, len, is_write))
-                       return dev;
-       }
-       return NULL;
+       return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
  }
  
- static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
-                                               gpa_t addr, int len,
-                                               int is_write)
+ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
  {
-       struct kvm_io_device *dev;
+       if (vcpu->arch.apic &&
+           !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
+               return 0;
  
-       dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
-       if (dev == NULL)
-               dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
-                                         is_write);
-       return dev;
+       return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
  }
  
  static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
@@@ -2172,11 -2526,12 +2526,12 @@@ static int emulator_read_emulated(unsig
                                  unsigned int bytes,
                                  struct kvm_vcpu *vcpu)
  {
-       struct kvm_io_device *mmio_dev;
        gpa_t                 gpa;
  
        if (vcpu->mmio_read_completed) {
                memcpy(val, vcpu->mmio_data, bytes);
+               trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
+                              vcpu->mmio_phys_addr, *(u64 *)val);
                vcpu->mmio_read_completed = 0;
                return X86EMUL_CONTINUE;
        }
@@@ -2197,14 -2552,12 +2552,12 @@@ mmio
        /*
         * Is this MMIO handled locally?
         */
-       mutex_lock(&vcpu->kvm->lock);
-       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
-       if (mmio_dev) {
-               kvm_iodevice_read(mmio_dev, gpa, bytes, val);
-               mutex_unlock(&vcpu->kvm->lock);
+       if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
+               trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
                return X86EMUL_CONTINUE;
        }
-       mutex_unlock(&vcpu->kvm->lock);
+       trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
  
        vcpu->mmio_needed = 1;
        vcpu->mmio_phys_addr = gpa;
@@@ -2231,7 -2584,6 +2584,6 @@@ static int emulator_write_emulated_onep
                                           unsigned int bytes,
                                           struct kvm_vcpu *vcpu)
  {
-       struct kvm_io_device *mmio_dev;
        gpa_t                 gpa;
  
        gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
                return X86EMUL_CONTINUE;
  
  mmio:
+       trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
        /*
         * Is this MMIO handled locally?
         */
-       mutex_lock(&vcpu->kvm->lock);
-       mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
-       if (mmio_dev) {
-               kvm_iodevice_write(mmio_dev, gpa, bytes, val);
-               mutex_unlock(&vcpu->kvm->lock);
+       if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
                return X86EMUL_CONTINUE;
-       }
-       mutex_unlock(&vcpu->kvm->lock);
  
        vcpu->mmio_needed = 1;
        vcpu->mmio_phys_addr = gpa;
@@@ -2297,7 -2644,12 +2644,7 @@@ static int emulator_cmpxchg_emulated(un
                                     unsigned int bytes,
                                     struct kvm_vcpu *vcpu)
  {
 -      static int reported;
 -
 -      if (!reported) {
 -              reported = 1;
 -              printk(KERN_WARNING "kvm: emulating exchange as write\n");
 -      }
 +      printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
  #ifndef CONFIG_X86_64
        /* guests cmpxchg8b have to be emulated atomically */
        if (bytes == 8) {
@@@ -2343,7 -2695,6 +2690,6 @@@ int emulate_invlpg(struct kvm_vcpu *vcp
  
  int emulate_clts(struct kvm_vcpu *vcpu)
  {
-       KVMTRACE_0D(CLTS, vcpu, handler);
        kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
        return X86EMUL_CONTINUE;
  }
@@@ -2420,7 -2771,7 +2766,7 @@@ int emulate_instruction(struct kvm_vcp
        kvm_clear_exception_queue(vcpu);
        vcpu->arch.mmio_fault_cr2 = cr2;
        /*
-        * TODO: fix x86_emulate.c to use guest_read/write_register
+        * TODO: fix emulate.c to use guest_read/write_register
         * instead of direct ->regs accesses, can save hundred cycles
         * on Intel for instructions that don't read/change RSP, for
         * example.
  
                r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
  
-               /* Reject the instructions other than VMCALL/VMMCALL when
-                * try to emulate invalid opcode */
+               /* Only allow emulation of specific instructions on #UD
+                * (namely VMMCALL, sysenter, sysexit, syscall)*/
                c = &vcpu->arch.emulate_ctxt.decode;
-               if ((emulation_type & EMULTYPE_TRAP_UD) &&
-                   (!(c->twobyte && c->b == 0x01 &&
-                     (c->modrm_reg == 0 || c->modrm_reg == 3) &&
-                      c->modrm_mod == 3 && c->modrm_rm == 1)))
-                       return EMULATE_FAIL;
+               if (emulation_type & EMULTYPE_TRAP_UD) {
+                       if (!c->twobyte)
+                               return EMULATE_FAIL;
+                       switch (c->b) {
+                       case 0x01: /* VMMCALL */
+                               if (c->modrm_mod != 3 || c->modrm_rm != 1)
+                                       return EMULATE_FAIL;
+                               break;
+                       case 0x34: /* sysenter */
+                       case 0x35: /* sysexit */
+                               if (c->modrm_mod != 0 || c->modrm_rm != 0)
+                                       return EMULATE_FAIL;
+                               break;
+                       case 0x05: /* syscall */
+                               if (c->modrm_mod != 0 || c->modrm_rm != 0)
+                                       return EMULATE_FAIL;
+                               break;
+                       default:
+                               return EMULATE_FAIL;
+                       }
+                       if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
+                               return EMULATE_FAIL;
+               }
  
                ++vcpu->stat.insn_emulation;
                if (r)  {
@@@ -2571,52 -2941,40 +2936,40 @@@ int complete_pio(struct kvm_vcpu *vcpu
        return 0;
  }
  
- static void kernel_pio(struct kvm_io_device *pio_dev,
-                      struct kvm_vcpu *vcpu,
-                      void *pd)
+ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
  {
        /* TODO: String I/O for in kernel device */
+       int r;
  
-       mutex_lock(&vcpu->kvm->lock);
        if (vcpu->arch.pio.in)
-               kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
-                                 vcpu->arch.pio.size,
-                                 pd);
+               r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+                                   vcpu->arch.pio.size, pd);
        else
-               kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
-                                  vcpu->arch.pio.size,
-                                  pd);
-       mutex_unlock(&vcpu->kvm->lock);
+               r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
+                                    vcpu->arch.pio.size, pd);
+       return r;
  }
  
- static void pio_string_write(struct kvm_io_device *pio_dev,
-                            struct kvm_vcpu *vcpu)
+ static int pio_string_write(struct kvm_vcpu *vcpu)
  {
        struct kvm_pio_request *io = &vcpu->arch.pio;
        void *pd = vcpu->arch.pio_data;
-       int i;
+       int i, r = 0;
  
-       mutex_lock(&vcpu->kvm->lock);
        for (i = 0; i < io->cur_count; i++) {
-               kvm_iodevice_write(pio_dev, io->port,
-                                  io->size,
-                                  pd);
+               if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
+                                    io->port, io->size, pd)) {
+                       r = -EOPNOTSUPP;
+                       break;
+               }
                pd += io->size;
        }
-       mutex_unlock(&vcpu->kvm->lock);
- }
- static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
-                                              gpa_t addr, int len,
-                                              int is_write)
- {
-       return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
+       return r;
  }
  
  int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                  int size, unsigned port)
  {
-       struct kvm_io_device *pio_dev;
        unsigned long val;
  
        vcpu->run->exit_reason = KVM_EXIT_IO;
        vcpu->arch.pio.down = 0;
        vcpu->arch.pio.rep = 0;
  
-       if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
-               KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
-       else
-               KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
+       trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+                     size, 1);
  
        val = kvm_register_read(vcpu, VCPU_REGS_RAX);
        memcpy(vcpu->arch.pio_data, &val, 4);
  
-       pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
-       if (pio_dev) {
-               kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
+       if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
                complete_pio(vcpu);
                return 1;
        }
@@@ -2656,7 -3008,6 +3003,6 @@@ int kvm_emulate_pio_string(struct kvm_v
  {
        unsigned now, in_page;
        int ret = 0;
-       struct kvm_io_device *pio_dev;
  
        vcpu->run->exit_reason = KVM_EXIT_IO;
        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
        vcpu->arch.pio.down = down;
        vcpu->arch.pio.rep = rep;
  
-       if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
-               KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
-       else
-               KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
-                           handler);
+       trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
+                     size, count);
  
        if (!count) {
                kvm_x86_ops->skip_emulated_instruction(vcpu);
  
        vcpu->arch.pio.guest_gva = address;
  
-       pio_dev = vcpu_find_pio_dev(vcpu, port,
-                                   vcpu->arch.pio.cur_count,
-                                   !vcpu->arch.pio.in);
        if (!vcpu->arch.pio.in) {
                /* string PIO write */
                ret = pio_copy_data(vcpu);
                        kvm_inject_gp(vcpu, 0);
                        return 1;
                }
-               if (ret == 0 && pio_dev) {
-                       pio_string_write(pio_dev, vcpu);
+               if (ret == 0 && !pio_string_write(vcpu)) {
                        complete_pio(vcpu);
                        if (vcpu->arch.pio.count == 0)
                                ret = 1;
                }
-       } else if (pio_dev)
-               pr_unimpl(vcpu, "no string pio read support yet, "
-                      "port %x size %d count %ld\n",
-                       port, size, count);
+       }
+       /* no string PIO read support yet */
  
        return ret;
  }
@@@ -2756,10 -3097,7 +3092,7 @@@ static int kvmclock_cpufreq_notifier(st
  
        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
-               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-                       vcpu = kvm->vcpus[i];
-                       if (!vcpu)
-                               continue;
+               kvm_for_each_vcpu(i, vcpu, kvm) {
                        if (vcpu->cpu != freq->cpu)
                                continue;
                        if (!kvm_request_guest_time_update(vcpu))
@@@ -2852,7 -3190,6 +3185,6 @@@ void kvm_arch_exit(void
  int kvm_emulate_halt(struct kvm_vcpu *vcpu)
  {
        ++vcpu->stat.halt_exits;
-       KVMTRACE_0D(HLT, vcpu, handler);
        if (irqchip_in_kernel(vcpu->kvm)) {
                vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
                return 1;
@@@ -2883,7 -3220,7 +3215,7 @@@ int kvm_emulate_hypercall(struct kvm_vc
        a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
        a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
  
-       KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
+       trace_kvm_hypercall(nr, a0, a1, a2, a3);
  
        if (!is_long_mode(vcpu)) {
                nr &= 0xFFFFFFFF;
                a3 &= 0xFFFFFFFF;
        }
  
+       if (kvm_x86_ops->get_cpl(vcpu) != 0) {
+               ret = -KVM_EPERM;
+               goto out;
+       }
        switch (nr) {
        case KVM_HC_VAPIC_POLL_IRQ:
                ret = 0;
                ret = -KVM_ENOSYS;
                break;
        }
+ out:
        kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
        ++vcpu->stat.hypercalls;
        return r;
@@@ -2983,8 -3326,6 +3321,6 @@@ unsigned long realmode_get_cr(struct kv
                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
                return 0;
        }
-       KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
-                   (u32)((u64)value >> 32), handler);
  
        return value;
  }
  void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
                     unsigned long *rflags)
  {
-       KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
-                   (u32)((u64)val >> 32), handler);
        switch (cr) {
        case 0:
                kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@@ -3104,11 -3442,11 +3437,11 @@@ void kvm_emulate_cpuid(struct kvm_vcpu 
                kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
        }
        kvm_x86_ops->skip_emulated_instruction(vcpu);
-       KVMTRACE_5D(CPUID, vcpu, function,
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
-                   (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
+       trace_kvm_cpuid(function,
+                       kvm_register_read(vcpu, VCPU_REGS_RAX),
+                       kvm_register_read(vcpu, VCPU_REGS_RBX),
+                       kvm_register_read(vcpu, VCPU_REGS_RCX),
+                       kvm_register_read(vcpu, VCPU_REGS_RDX));
  }
  EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
  
@@@ -3174,6 -3512,9 +3507,9 @@@ static void update_cr8_intercept(struc
        if (!kvm_x86_ops->update_cr8_intercept)
                return;
  
+       if (!vcpu->arch.apic)
+               return;
        if (!vcpu->arch.apic->vapic_addr)
                max_irr = kvm_lapic_find_highest_irr(vcpu);
        else
        kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
  }
  
- static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+ static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
  {
-       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-               kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
        /* try to reinject previous events if any */
+       if (vcpu->arch.exception.pending) {
+               kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
+                                         vcpu->arch.exception.has_error_code,
+                                         vcpu->arch.exception.error_code);
+               return;
+       }
        if (vcpu->arch.nmi_injected) {
                kvm_x86_ops->set_nmi(vcpu);
                return;
@@@ -3266,16 -3611,14 +3606,14 @@@ static int vcpu_enter_guest(struct kvm_
        smp_mb__after_clear_bit();
  
        if (vcpu->requests || need_resched() || signal_pending(current)) {
+               set_bit(KVM_REQ_KICK, &vcpu->requests);
                local_irq_enable();
                preempt_enable();
                r = 1;
                goto out;
        }
  
-       if (vcpu->arch.exception.pending)
-               __queue_exception(vcpu);
-       else
-               inject_pending_irq(vcpu, kvm_run);
+       inject_pending_event(vcpu, kvm_run);
  
        /* enable NMI/IRQ window open exits if needed */
        if (vcpu->arch.nmi_pending)
  
        kvm_guest_enter();
  
-       get_debugreg(vcpu->arch.host_dr6, 6);
-       get_debugreg(vcpu->arch.host_dr7, 7);
        if (unlikely(vcpu->arch.switch_db_regs)) {
-               get_debugreg(vcpu->arch.host_db[0], 0);
-               get_debugreg(vcpu->arch.host_db[1], 1);
-               get_debugreg(vcpu->arch.host_db[2], 2);
-               get_debugreg(vcpu->arch.host_db[3], 3);
                set_debugreg(0, 7);
                set_debugreg(vcpu->arch.eff_db[0], 0);
                set_debugreg(vcpu->arch.eff_db[1], 1);
                set_debugreg(vcpu->arch.eff_db[3], 3);
        }
  
-       KVMTRACE_0D(VMENTRY, vcpu, entryexit);
+       trace_kvm_entry(vcpu->vcpu_id);
        kvm_x86_ops->run(vcpu, kvm_run);
  
-       if (unlikely(vcpu->arch.switch_db_regs)) {
-               set_debugreg(0, 7);
-               set_debugreg(vcpu->arch.host_db[0], 0);
-               set_debugreg(vcpu->arch.host_db[1], 1);
-               set_debugreg(vcpu->arch.host_db[2], 2);
-               set_debugreg(vcpu->arch.host_db[3], 3);
+       if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
+               set_debugreg(current->thread.debugreg0, 0);
+               set_debugreg(current->thread.debugreg1, 1);
+               set_debugreg(current->thread.debugreg2, 2);
+               set_debugreg(current->thread.debugreg3, 3);
+               set_debugreg(current->thread.debugreg6, 6);
+               set_debugreg(current->thread.debugreg7, 7);
        }
-       set_debugreg(vcpu->arch.host_dr6, 6);
-       set_debugreg(vcpu->arch.host_dr7, 7);
  
        set_bit(KVM_REQ_KICK, &vcpu->requests);
        local_irq_enable();
@@@ -3648,11 -3983,8 +3978,8 @@@ static void kvm_set_segment(struct kvm_
  static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
                                   struct kvm_segment *kvm_desct)
  {
-       kvm_desct->base = seg_desc->base0;
-       kvm_desct->base |= seg_desc->base1 << 16;
-       kvm_desct->base |= seg_desc->base2 << 24;
-       kvm_desct->limit = seg_desc->limit0;
-       kvm_desct->limit |= seg_desc->limit << 16;
+       kvm_desct->base = get_desc_base(seg_desc);
+       kvm_desct->limit = get_desc_limit(seg_desc);
        if (seg_desc->g) {
                kvm_desct->limit <<= 12;
                kvm_desct->limit |= 0xfff;
@@@ -3696,7 -4028,6 +4023,6 @@@ static void get_segment_descriptor_dtab
  static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
                                         struct desc_struct *seg_desc)
  {
-       gpa_t gpa;
        struct descriptor_table dtable;
        u16 index = selector >> 3;
  
                kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
                return 1;
        }
-       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
-       gpa += index * 8;
-       return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
+       return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
  }
  
  /* allowed just for 8 bytes segments */
  static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
                                         struct desc_struct *seg_desc)
  {
-       gpa_t gpa;
        struct descriptor_table dtable;
        u16 index = selector >> 3;
  
  
        if (dtable.limit < index * 8 + 7)
                return 1;
-       gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
-       gpa += index * 8;
-       return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
+       return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
  }
  
  static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
                             struct desc_struct *seg_desc)
  {
-       u32 base_addr;
-       base_addr = seg_desc->base0;
-       base_addr |= (seg_desc->base1 << 16);
-       base_addr |= (seg_desc->base2 << 24);
+       u32 base_addr = get_desc_base(seg_desc);
  
        return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
  }
@@@ -3780,12 -4102,19 +4097,19 @@@ static int kvm_load_realmode_segment(st
        return 0;
  }
  
+ static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
+ {
+       return (seg != VCPU_SREG_LDTR) &&
+               (seg != VCPU_SREG_TR) &&
+               (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM);
+ }
  int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
                                int type_bits, int seg)
  {
        struct kvm_segment kvm_seg;
  
-       if (!(vcpu->arch.cr0 & X86_CR0_PE))
+       if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
                return kvm_load_realmode_segment(vcpu, selector, seg);
        if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
                return 1;
@@@ -4024,7 -4353,7 +4348,7 @@@ int kvm_task_switch(struct kvm_vcpu *vc
                }
        }
  
-       if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
+       if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
                kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
                return 1;
        }
@@@ -4094,13 -4423,7 +4418,7 @@@ int kvm_arch_vcpu_ioctl_set_sregs(struc
  
        vcpu->arch.cr2 = sregs->cr2;
        mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
-       down_read(&vcpu->kvm->slots_lock);
-       if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
-               vcpu->arch.cr3 = sregs->cr3;
-       else
-               set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
-       up_read(&vcpu->kvm->slots_lock);
+       vcpu->arch.cr3 = sregs->cr3;
  
        kvm_set_cr8(vcpu, sregs->cr8);
  
        kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
        kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
  
+       update_cr8_intercept(vcpu);
        /* Older userspace won't unhalt the vcpu on reset. */
-       if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+       if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
            sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
            !(vcpu->arch.cr0 & X86_CR0_PE))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@@ -4414,7 -4739,7 +4734,7 @@@ int kvm_arch_vcpu_init(struct kvm_vcpu 
        kvm = vcpu->kvm;
  
        vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-       if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
+       if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        else
                vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
                        goto fail_mmu_destroy;
        }
  
+       vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+                                      GFP_KERNEL);
+       if (!vcpu->arch.mce_banks) {
+               r = -ENOMEM;
+               goto fail_mmu_destroy;
+       }
+       vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
        return 0;
  
  fail_mmu_destroy:
@@@ -4483,20 -4816,22 +4811,22 @@@ static void kvm_unload_vcpu_mmu(struct 
  static void kvm_free_vcpus(struct kvm *kvm)
  {
        unsigned int i;
+       struct kvm_vcpu *vcpu;
  
        /*
         * Unpin any mmu pages first.
         */
-       for (i = 0; i < KVM_MAX_VCPUS; ++i)
-               if (kvm->vcpus[i])
-                       kvm_unload_vcpu_mmu(kvm->vcpus[i]);
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               if (kvm->vcpus[i]) {
-                       kvm_arch_vcpu_free(kvm->vcpus[i]);
-                       kvm->vcpus[i] = NULL;
-               }
-       }
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_unload_vcpu_mmu(vcpu);
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               kvm_arch_vcpu_free(vcpu);
+       mutex_lock(&kvm->lock);
+       for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+               kvm->vcpus[i] = NULL;
  
+       atomic_set(&kvm->online_vcpus, 0);
+       mutex_unlock(&kvm->lock);
  }
  
  void kvm_arch_sync_events(struct kvm *kvm)
@@@ -4573,7 -4908,6 +4903,6 @@@ int kvm_arch_set_memory_region(struct k
  
        kvm_mmu_slot_remove_write_access(kvm, mem->slot);
        spin_unlock(&kvm->mmu_lock);
-       kvm_flush_remote_tlbs(kvm);
  
        return 0;
  }
@@@ -4587,8 -4921,10 +4916,10 @@@ void kvm_arch_flush_shadow(struct kvm *
  int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
  {
        return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
-              || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
-              || vcpu->arch.nmi_pending;
+               || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+               || vcpu->arch.nmi_pending ||
+               (kvm_arch_interrupt_allowed(vcpu) &&
+                kvm_cpu_has_interrupt(vcpu));
  }
  
  void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
@@@ -4612,3 -4948,9 +4943,9 @@@ int kvm_arch_interrupt_allowed(struct k
  {
        return kvm_x86_ops->interrupt_allowed(vcpu);
  }
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
diff --combined arch/x86/mm/highmem_32.c
@@@ -24,7 -24,7 +24,7 @@@ void kunmap(struct page *page
   * no global lock is needed and because the kmap code must perform a global TLB
   * invalidation when the kmap pool wraps.
   *
 - * However when holding an atomic kmap is is not legal to sleep, so atomic
 + * However when holding an atomic kmap it is not legal to sleep, so atomic
   * kmaps are appropriate for short, tight code paths only.
   */
  void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
@@@ -104,6 -104,7 +104,7 @@@ EXPORT_SYMBOL(kunmap)
  EXPORT_SYMBOL(kmap_atomic);
  EXPORT_SYMBOL(kunmap_atomic);
  EXPORT_SYMBOL(kmap_atomic_prot);
+ EXPORT_SYMBOL(kmap_atomic_to_page);
  
  void __init set_highmem_pages_init(void)
  {