Merge branches 'x86/xen', 'x86/build', 'x86/microcode', 'x86/mm-debug-v2', 'x86/memor...
Ingo Molnar [Sun, 12 Oct 2008 13:50:02 +0000 (15:50 +0200)]
105 files changed:
Documentation/kernel-parameters.txt
MAINTAINERS
arch/ia64/include/asm/siginfo.h
arch/powerpc/include/asm/siginfo.h
arch/x86/Kconfig
arch/x86/Kconfig.cpu
arch/x86/Kconfig.debug
arch/x86/Makefile_32.cpu
arch/x86/boot/Makefile
arch/x86/boot/compressed/Makefile
arch/x86/boot/edd.c
arch/x86/boot/video-vesa.c
arch/x86/configs/i386_defconfig
arch/x86/configs/x86_64_defconfig
arch/x86/ia32/ia32_signal.c
arch/x86/kernel/Makefile
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/amd_iommu_init.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/doublefault_32.c
arch/x86/kernel/early-quirks.c
arch/x86/kernel/early_printk.c
arch/x86/kernel/i387.c
arch/x86/kernel/io_apic_64.c
arch/x86/kernel/ldt.c
arch/x86/kernel/microcode.c [deleted file]
arch/x86/kernel/microcode_amd.c [new file with mode: 0644]
arch/x86/kernel/microcode_core.c [new file with mode: 0644]
arch/x86/kernel/microcode_intel.c [new file with mode: 0644]
arch/x86/kernel/paravirt-spinlocks.c [new file with mode: 0644]
arch/x86/kernel/paravirt.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/setup.c
arch/x86/kernel/signal_32.c
arch/x86/kernel/signal_64.c
arch/x86/kernel/smp.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/tlb_32.c
arch/x86/kernel/traps_32.c
arch/x86/kernel/traps_64.c
arch/x86/kernel/vmlinux_64.lds.S
arch/x86/kernel/xsave.c
arch/x86/mm/fault.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/ioremap.c
arch/x86/xen/Kconfig
arch/x86/xen/Makefile
arch/x86/xen/debugfs.c [new file with mode: 0644]
arch/x86/xen/debugfs.h [new file with mode: 0644]
arch/x86/xen/enlighten.c
arch/x86/xen/irq.c [new file with mode: 0644]
arch/x86/xen/mmu.c
arch/x86/xen/mmu.h
arch/x86/xen/multicalls.c
arch/x86/xen/smp.c
arch/x86/xen/spinlock.c [new file with mode: 0644]
arch/x86/xen/time.c
arch/x86/xen/xen-asm_32.S
arch/x86/xen/xen-asm_64.S
arch/x86/xen/xen-ops.h
drivers/block/xen-blkfront.c
drivers/char/hvc_xen.c
drivers/input/xen-kbdfront.c
drivers/net/xen-netfront.c
drivers/usb/host/ehci.h
drivers/video/Kconfig
drivers/video/console/Kconfig
drivers/video/xen-fbfront.c
drivers/xen/Makefile
drivers/xen/balloon.c
drivers/xen/cpu_hotplug.c [new file with mode: 0644]
drivers/xen/events.c
drivers/xen/grant-table.c
drivers/xen/xenbus/xenbus_probe.c
include/asm-generic/siginfo.h
include/asm-parisc/siginfo.h
include/asm-x86/bios_ebda.h
include/asm-x86/boot.h
include/asm-x86/desc.h
include/asm-x86/microcode.h [new file with mode: 0644]
include/asm-x86/mmzone_64.h
include/asm-x86/page_32.h
include/asm-x86/paravirt.h
include/asm-x86/processor.h
include/asm-x86/ptrace.h
include/asm-x86/smp.h
include/asm-x86/spinlock.h
include/asm-x86/tlbflush.h
include/asm-x86/traps.h
include/asm-x86/xen/hypervisor.h
include/linux/kernel.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmdebug.h [new file with mode: 0644]
include/linux/sched.h
include/linux/usb/ehci_def.h [new file with mode: 0644]
include/xen/balloon.h [deleted file]
include/xen/events.h
lib/Kconfig.debug
lib/cmdline.c
mm/vmalloc.c
scripts/Kbuild.include

index 25efbaf..2443f5b 100644 (file)
@@ -658,11 +658,12 @@ and is between 256 and 4096 characters. It is defined in the file
        earlyprintk=    [X86-32,X86-64,SH,BLACKFIN]
                        earlyprintk=vga
                        earlyprintk=serial[,ttySn[,baudrate]]
+                       earlyprintk=dbgp
 
                        Append ",keep" to not disable it when the real console
                        takes over.
 
-                       Only vga or serial at a time, not both.
+                       Only vga or serial or usb debug port at a time.
 
                        Currently only ttyS0 and ttyS1 are supported.
 
@@ -1231,6 +1232,29 @@ and is between 256 and 4096 characters. It is defined in the file
                                 or
                                 memmap=0x10000$0x18690000
 
+       memory_corruption_check=0/1 [X86]
+                       Some BIOSes seem to corrupt the first 64k of
+                       memory when doing things like suspend/resume.
+                       Setting this option will scan the memory
+                       looking for corruption.  Enabling this will
+                       both detect corruption and prevent the kernel
+                       from using the memory being corrupted.
+                       However, it's intended as a diagnostic tool; if
+                       repeatable BIOS-originated corruption always
+                       affects the same memory, you can use memmap=
+                       to prevent the kernel from using that memory.
+
+       memory_corruption_check_size=size [X86]
+                       By default it checks for corruption in the low
+                       64k, making this memory unavailable for normal
+                       use.  Use this parameter to scan for
+                       corruption in more or less memory.
+
+       memory_corruption_check_period=seconds [X86]
+                       By default it checks for corruption every 60
+                       seconds.  Use this parameter to check at some
+                       other rate.  0 disables periodic checking.
+
        memtest=        [KNL,X86] Enable memtest
                        Format: <integer>
                        range: 0,4 : pattern number
index 587f418..8bf72d3 100644 (file)
@@ -390,6 +390,11 @@ L: iommu@lists.linux-foundation.org
 T:     git://git.kernel.org/pub/scm/linux/kernel/git/joro/linux-2.6-iommu.git
 S:     Supported
 
+AMD MICROCODE UPDATE SUPPORT
+P:      Peter Oruba
+M:      peter.oruba@amd.com
+S:      Supported
+
 AMS (Apple Motion Sensor) DRIVER
 P:     Stelian Pop
 M:     stelian@popies.net
index 9294e4b..118d429 100644 (file)
@@ -113,11 +113,6 @@ typedef struct siginfo {
 #undef NSIGSEGV
 #define NSIGSEGV       3
 
-/*
- * SIGTRAP si_codes
- */
-#define TRAP_BRANCH    (__SI_FAULT|3)  /* process taken branch trap */
-#define TRAP_HWBKPT    (__SI_FAULT|4)  /* hardware breakpoint or watchpoint */
 #undef NSIGTRAP
 #define NSIGTRAP       4
 
index 12f1bce..49495b0 100644 (file)
 
 #include <asm-generic/siginfo.h>
 
-/*
- * SIGTRAP si_codes
- */
-#define TRAP_BRANCH    (__SI_FAULT|3)  /* process taken branch trap */
-#define TRAP_HWBKPT    (__SI_FAULT|4)  /* hardware breakpoint or watchpoint */
 #undef NSIGTRAP
 #define NSIGTRAP       4
 
index 44d4f21..fc8351f 100644 (file)
@@ -778,23 +778,45 @@ config X86_REBOOTFIXUPS
          Say N otherwise.
 
 config MICROCODE
-       tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
+       tristate "/dev/cpu/microcode - microcode support"
        select FW_LOADER
        ---help---
          If you say Y here, you will be able to update the microcode on
-         Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II,
-         Pentium III, Pentium 4, Xeon etc.  You will obviously need the
-         actual microcode binary data itself which is not shipped with the
-         Linux kernel.
+         certain Intel and AMD processors. The Intel support is for the
+         IA32 family, e.g. Pentium Pro, Pentium II, Pentium III,
+         Pentium 4, Xeon etc. The AMD support is for family 0x10 and
+         0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra.
+         You will obviously need the actual microcode binary data itself
+         which is not shipped with the Linux kernel.
 
-         For latest news and information on obtaining all the required
-         ingredients for this driver, check:
-         <http://www.urbanmyth.org/microcode/>.
+         This option selects the general module only, you need to select
+         at least one vendor specific module as well.
 
          To compile this driver as a module, choose M here: the
          module will be called microcode.
 
-config MICROCODE_OLD_INTERFACE
+config MICROCODE_INTEL
+       bool "Intel microcode patch loading support"
+       depends on MICROCODE
+       default MICROCODE
+       select FW_LOADER
+       ---help---
+         This option enables microcode patch loading support for Intel
+         processors.
+
+         For latest news and information on obtaining all the required
+         Intel ingredients for this driver, check:
+         <http://www.urbanmyth.org/microcode/>.
+
+config MICROCODE_AMD
+       bool "AMD microcode patch loading support"
+       depends on MICROCODE
+       select FW_LOADER
+       ---help---
+         If you select this option, microcode patch loading support for AMD
+        processors will be enabled.
+
+config MICROCODE_OLD_INTERFACE
        def_bool y
        depends on MICROCODE
 
@@ -1061,6 +1083,56 @@ config HIGHPTE
          low memory.  Setting this option will put user-space page table
          entries in high memory.
 
+config X86_CHECK_BIOS_CORRUPTION
+        bool "Check for low memory corruption"
+       help
+        Periodically check for memory corruption in low memory, which
+        is suspected to be caused by BIOS.  Even when enabled in the
+        configuration, it is disabled at runtime.  Enable it by
+        setting "memory_corruption_check=1" on the kernel command
+        line.  By default it scans the low 64k of memory every 60
+        seconds; see the memory_corruption_check_size and
+        memory_corruption_check_period parameters in
+        Documentation/kernel-parameters.txt to adjust this.
+
+        When enabled with the default parameters, this option has
+        almost no overhead, as it reserves a relatively small amount
+        of memory and scans it infrequently.  It both detects corruption
+        and prevents it from affecting the running system.
+
+        It is, however, intended as a diagnostic tool; if repeatable
+        BIOS-originated corruption always affects the same memory,
+        you can use memmap= to prevent the kernel from using that
+        memory.
+
+config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
+        bool "Set the default setting of memory_corruption_check"
+       depends on X86_CHECK_BIOS_CORRUPTION
+       default y
+       help
+        Set whether the default state of memory_corruption_check is
+        on or off.
+
+config X86_RESERVE_LOW_64K
+        bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
+       default y
+       help
+        Reserve the first 64K of physical RAM on BIOSes that are known
+        to potentially corrupt that memory range. A number of BIOSes are
+        known to utilize this area during suspend/resume, so it must not
+        be used by the kernel.
+
+        Set this to N if you are absolutely sure that you trust the BIOS
+        to get all its memory reservations and usages right.
+
+        If you have doubts about the BIOS (e.g. suspend/resume does not
+        work or there are kernel crashes after certain hardware hotplug
+        events) and it's not AMI or Phoenix, then you might want to enable
+        X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
+        corruption patterns.
+
+        Say Y if unsure.
+
 config MATH_EMULATION
        bool
        prompt "Math emulation" if X86_32
index f8843c3..c5f1013 100644 (file)
@@ -420,7 +420,6 @@ config X86_DEBUGCTLMSR
        depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
 
 menuconfig PROCESSOR_SELECT
-       default y
        bool "Supported processor vendors" if EMBEDDED
        help
          This lets you choose what x86 vendor support code your kernel
@@ -430,48 +429,97 @@ config CPU_SUP_INTEL
        default y
        bool "Support Intel processors" if PROCESSOR_SELECT
        help
-         This enables extended support for Intel processors
+         This enables detection, tunings and quirks for Intel processors
+
+         You need this enabled if you want your kernel to run on an
+         Intel CPU. Disabling this option on other types of CPUs
+         makes the kernel a tiny bit smaller. Disabling it on an Intel
+         CPU might render the kernel unbootable.
+
+         If unsure, say N.
 
 config CPU_SUP_CYRIX_32
        default y
        bool "Support Cyrix processors" if PROCESSOR_SELECT
        depends on !64BIT
        help
-         This enables extended support for Cyrix processors
+         This enables detection, tunings and quirks for Cyrix processors
+
+         You need this enabled if you want your kernel to run on a
+         Cyrix CPU. Disabling this option on other types of CPUs
+         makes the kernel a tiny bit smaller. Disabling it on a Cyrix
+         CPU might render the kernel unbootable.
+
+         If unsure, say N.
 
 config CPU_SUP_AMD
        default y
        bool "Support AMD processors" if PROCESSOR_SELECT
        help
-         This enables extended support for AMD processors
+         This enables detection, tunings and quirks for AMD processors
+
+         You need this enabled if you want your kernel to run on an
+         AMD CPU. Disabling this option on other types of CPUs
+         makes the kernel a tiny bit smaller. Disabling it on an AMD
+         CPU might render the kernel unbootable.
+
+         If unsure, say N.
 
 config CPU_SUP_CENTAUR_32
        default y
        bool "Support Centaur processors" if PROCESSOR_SELECT
        depends on !64BIT
        help
-         This enables extended support for Centaur processors
+         This enables detection, tunings and quirks for Centaur processors
+
+         You need this enabled if you want your kernel to run on a
+         Centaur CPU. Disabling this option on other types of CPUs
+         makes the kernel a tiny bit smaller. Disabling it on a Centaur
+         CPU might render the kernel unbootable.
+
+         If unsure, say N.
 
 config CPU_SUP_CENTAUR_64
        default y
        bool "Support Centaur processors" if PROCESSOR_SELECT
        depends on 64BIT
        help
-         This enables extended support for Centaur processors
+         This enables detection, tunings and quirks for Centaur processors
+
+         You need this enabled if you want your kernel to run on a
+         Centaur CPU. Disabling this option on other types of CPUs
+         makes the kernel a tiny bit smaller. Disabling it on a Centaur
+         CPU might render the kernel unbootable.
+
+         If unsure, say N.
 
 config CPU_SUP_TRANSMETA_32
        default y
        bool "Support Transmeta processors" if PROCESSOR_SELECT
        depends on !64BIT
        help
-         This enables extended support for Transmeta processors
+         This enables detection, tunings and quirks for Transmeta processors
+
+         You need this enabled if you want your kernel to run on a
+         Transmeta CPU. Disabling this option on other types of CPUs
+         makes the kernel a tiny bit smaller. Disabling it on a Transmeta
+         CPU might render the kernel unbootable.
+
+         If unsure, say N.
 
 config CPU_SUP_UMC_32
        default y
        bool "Support UMC processors" if PROCESSOR_SELECT
        depends on !64BIT
        help
-         This enables extended support for UMC processors
+         This enables detection, tunings and quirks for UMC processors
+
+         You need this enabled if you want your kernel to run on a
+         UMC CPU. Disabling this option on other types of CPUs
+         makes the kernel a tiny bit smaller. Disabling it on a UMC
+         CPU might render the kernel unbootable.
+
+         If unsure, say N.
 
 config X86_DS
        bool "Debug Store support"
index 092f019..2a3dfbd 100644 (file)
@@ -43,6 +43,19 @@ config EARLY_PRINTK
          with klogd/syslogd or the X server. You should normally N here,
          unless you want to debug such a crash.
 
+config EARLY_PRINTK_DBGP
+       bool "Early printk via EHCI debug port"
+       default n
+       depends on EARLY_PRINTK && PCI
+       help
+         Write kernel log output directly into the EHCI debug port.
+
+         This is useful for kernel debugging when your machine crashes very
+         early before the console code is initialized. For normal operation
+         it is not recommended because it looks ugly and doesn't cooperate
+         with klogd/syslogd or the X server. You should normally say N here,
+         unless you want to debug such a crash. You need a USB debug device.
+
 config DEBUG_STACKOVERFLOW
        bool "Check for stack overflows"
        depends on DEBUG_KERNEL
index e372b58..b72b4f7 100644 (file)
@@ -45,3 +45,8 @@ cflags-$(CONFIG_MGEODEGX1)    += -march=pentium-mmx
 # cpu entries
 cflags-$(CONFIG_X86_GENERIC)   += $(call tune,generic,$(call tune,i686))
 
+# Bug fix for binutils: this option is required in order to keep
+# binutils from generating NOPL instructions against our will.
+ifneq ($(CONFIG_X86_P6_NOP),y)
+cflags-y                       += $(call cc-option,-Wa$(comma)-mtune=generic32,)
+endif
index 7ee102f..cd48c72 100644 (file)
@@ -72,9 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
 KBUILD_CFLAGS +=   $(call cc-option,-m32)
 KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
 
-$(obj)/zImage:  IMAGE_OFFSET := 0x1000
 $(obj)/zImage:  asflags-y := $(SVGA_MODE) $(RAMDISK)
-$(obj)/bzImage: IMAGE_OFFSET := 0x100000
 $(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
 $(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
 $(obj)/bzImage: BUILDFLAGS   := -b
@@ -117,7 +115,7 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
        $(call if_changed,objcopy)
 
 $(obj)/compressed/vmlinux: FORCE
-       $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@
+       $(Q)$(MAKE) $(build)=$(obj)/compressed $@
 
 # Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
 FDARGS =
@@ -181,6 +179,7 @@ isoimage: $(BOOTIMAGE)
        mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
                -no-emul-boot -boot-load-size 4 -boot-info-table \
                $(obj)/isoimage
+       isohybrid $(obj)/image.iso 2>/dev/null || true
        rm -rf $(obj)/isoimage
 
 zlilo: $(BOOTIMAGE)
index 92fdd35..1771c80 100644 (file)
@@ -27,9 +27,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE
        $(call if_changed,objcopy)
 
 
-ifeq ($(CONFIG_X86_32),y)
-targets += vmlinux.bin.all vmlinux.relocs
-hostprogs-y := relocs
+targets += vmlinux.bin.all vmlinux.relocs relocs
+hostprogs-$(CONFIG_X86_32) += relocs
 
 quiet_cmd_relocs = RELOCS  $@
       cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -43,6 +42,8 @@ quiet_cmd_relocbin = BUILD   $@
 $(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
        $(call if_changed,relocbin)
 
+ifeq ($(CONFIG_X86_32),y)
+
 ifdef CONFIG_RELOCATABLE
 $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
        $(call if_changed,gzip)
@@ -59,6 +60,5 @@ $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
 LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
 endif
 
-
 $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
        $(call if_changed,ld)
index d93cbc6..1aae8f3 100644 (file)
@@ -41,6 +41,7 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
        char *mbrbuf_ptr, *mbrbuf_end;
        u32 buf_base, mbr_base;
        extern char _end[];
+       u16 mbr_magic;
 
        sector_size = ei->params.bytes_per_sector;
        if (!sector_size)
@@ -58,11 +59,15 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
        if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr)
                return -1;
 
+       memset(mbrbuf_ptr, 0, sector_size);
        if (read_mbr(devno, mbrbuf_ptr))
                return -1;
 
        *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET];
-       return 0;
+       mbr_magic = *(u16 *)&mbrbuf_ptr[510];
+
+       /* check for valid MBR magic */
+       return mbr_magic == 0xAA55 ? 0 : -1;
 }
 
 static int get_edd_info(u8 devno, struct edd_info *ei)
index 401ad99..1e6fe02 100644 (file)
@@ -224,7 +224,7 @@ static void vesa_store_pm_info(void)
 static void vesa_store_mode_params_graphics(void)
 {
        /* Tell the kernel we're in VESA graphics mode */
-       boot_params.screen_info.orig_video_isVGA = 0x23;
+       boot_params.screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
 
        /* Mode parameters */
        boot_params.screen_info.vesa_attributes = vminfo.mode_attr;
index ef9a520..ca226ca 100644 (file)
@@ -1535,7 +1535,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
 CONFIG_VGA_CONSOLE=y
 CONFIG_VGACON_SOFT_SCROLLBACK=y
 CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
-CONFIG_VIDEO_SELECT=y
 CONFIG_DUMMY_CONSOLE=y
 # CONFIG_FRAMEBUFFER_CONSOLE is not set
 CONFIG_LOGO=y
index e620ea6..2c4b1c7 100644 (file)
@@ -1505,7 +1505,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
 CONFIG_VGA_CONSOLE=y
 CONFIG_VGACON_SOFT_SCROLLBACK=y
 CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
-CONFIG_VIDEO_SELECT=y
 CONFIG_DUMMY_CONSOLE=y
 # CONFIG_FRAMEBUFFER_CONSOLE is not set
 CONFIG_LOGO=y
index 8d64c1b..4bc02b2 100644 (file)
@@ -351,31 +351,28 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
        savesegment(es, tmp);
        err |= __put_user(tmp, (unsigned int __user *)&sc->es);
 
-       err |= __put_user((u32)regs->di, &sc->di);
-       err |= __put_user((u32)regs->si, &sc->si);
-       err |= __put_user((u32)regs->bp, &sc->bp);
-       err |= __put_user((u32)regs->sp, &sc->sp);
-       err |= __put_user((u32)regs->bx, &sc->bx);
-       err |= __put_user((u32)regs->dx, &sc->dx);
-       err |= __put_user((u32)regs->cx, &sc->cx);
-       err |= __put_user((u32)regs->ax, &sc->ax);
-       err |= __put_user((u32)regs->cs, &sc->cs);
-       err |= __put_user((u32)regs->ss, &sc->ss);
+       err |= __put_user(regs->di, &sc->di);
+       err |= __put_user(regs->si, &sc->si);
+       err |= __put_user(regs->bp, &sc->bp);
+       err |= __put_user(regs->sp, &sc->sp);
+       err |= __put_user(regs->bx, &sc->bx);
+       err |= __put_user(regs->dx, &sc->dx);
+       err |= __put_user(regs->cx, &sc->cx);
+       err |= __put_user(regs->ax, &sc->ax);
+       err |= __put_user(regs->cs, &sc->cs);
+       err |= __put_user(regs->ss, &sc->ss);
        err |= __put_user(current->thread.trap_no, &sc->trapno);
        err |= __put_user(current->thread.error_code, &sc->err);
-       err |= __put_user((u32)regs->ip, &sc->ip);
-       err |= __put_user((u32)regs->flags, &sc->flags);
-       err |= __put_user((u32)regs->sp, &sc->sp_at_signal);
+       err |= __put_user(regs->ip, &sc->ip);
+       err |= __put_user(regs->flags, &sc->flags);
+       err |= __put_user(regs->sp, &sc->sp_at_signal);
 
        tmp = save_i387_xstate_ia32(fpstate);
        if (tmp < 0)
                err = -EFAULT;
-       else {
-               clear_used_math();
-               stts();
+       else
                err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
                                        &sc->fpstate);
-       }
 
        /* non-iBCS2 extensions.. */
        err |= __put_user(mask, &sc->oldmask);
@@ -444,21 +441,18 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
        frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
 
        if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-               goto give_sigsegv;
+               return -EFAULT;
 
-       err |= __put_user(sig, &frame->sig);
-       if (err)
-               goto give_sigsegv;
+       if (__put_user(sig, &frame->sig))
+               return -EFAULT;
 
-       err |= ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]);
-       if (err)
-               goto give_sigsegv;
+       if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
+               return -EFAULT;
 
        if (_COMPAT_NSIG_WORDS > 1) {
-               err |= __copy_to_user(frame->extramask, &set->sig[1],
-                                     sizeof(frame->extramask));
-               if (err)
-                       goto give_sigsegv;
+               if (__copy_to_user(frame->extramask, &set->sig[1],
+                                  sizeof(frame->extramask)))
+                       return -EFAULT;
        }
 
        if (ka->sa.sa_flags & SA_RESTORER) {
@@ -479,7 +473,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
         */
        err |= __copy_to_user(frame->retcode, &code, 8);
        if (err)
-               goto give_sigsegv;
+               return -EFAULT;
 
        /* Set up registers for signal handler */
        regs->sp = (unsigned long) frame;
@@ -502,10 +496,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
 #endif
 
        return 0;
-
-give_sigsegv:
-       force_sigsegv(sig, current);
-       return -EFAULT;
 }
 
 int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
@@ -533,14 +523,14 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
        frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
 
        if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-               goto give_sigsegv;
+               return -EFAULT;
 
        err |= __put_user(sig, &frame->sig);
        err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
        err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
        err |= copy_siginfo_to_user32(&frame->info, info);
        if (err)
-               goto give_sigsegv;
+               return -EFAULT;
 
        /* Create the ucontext.  */
        if (cpu_has_xsave)
@@ -556,7 +546,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
                                     regs, set->sig[0]);
        err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
        if (err)
-               goto give_sigsegv;
+               return -EFAULT;
 
        if (ka->sa.sa_flags & SA_RESTORER)
                restorer = ka->sa.sa_restorer;
@@ -571,7 +561,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
         */
        err |= __copy_to_user(frame->retcode, &code, 8);
        if (err)
-               goto give_sigsegv;
+               return -EFAULT;
 
        /* Set up registers for signal handler */
        regs->sp = (unsigned long) frame;
@@ -599,8 +589,4 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 #endif
 
        return 0;
-
-give_sigsegv:
-       force_sigsegv(sig, current);
-       return -EFAULT;
 }
index c9be69f..5098585 100644 (file)
@@ -10,7 +10,7 @@ ifdef CONFIG_FTRACE
 # Do not profile debug and lowlevel utilities
 CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
-CFLAGS_REMOVE_paravirt.o = -pg
+CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
 endif
 
 #
@@ -51,7 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
 obj-$(CONFIG_MCA)              += mca_32.o
 obj-$(CONFIG_X86_MSR)          += msr.o
 obj-$(CONFIG_X86_CPUID)                += cpuid.o
-obj-$(CONFIG_MICROCODE)                += microcode.o
 obj-$(CONFIG_PCI)              += early-quirks.o
 apm-y                          := apm_32.o
 obj-$(CONFIG_APM)              += apm.o
@@ -90,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST)   += test_nx.o
 obj-$(CONFIG_VMI)              += vmi_32.o vmiclock_32.o
 obj-$(CONFIG_KVM_GUEST)                += kvm.o
 obj-$(CONFIG_KVM_CLOCK)                += kvmclock.o
-obj-$(CONFIG_PARAVIRT)         += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT)         += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)   += pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)  += pcspeaker.o
@@ -100,6 +99,11 @@ scx200-y                    += scx200_32.o
 
 obj-$(CONFIG_OLPC)             += olpc.o
 
+microcode-y                            := microcode_core.o
+microcode-$(CONFIG_MICROCODE_INTEL)    += microcode_intel.o
+microcode-$(CONFIG_MICROCODE_AMD)      += microcode_amd.o
+obj-$(CONFIG_MICROCODE)                        += microcode.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
index c2ac1b4..eb875cd 100644 (file)
@@ -1418,8 +1418,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
  */
 static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
 {
-       pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident);
-       acpi_skip_timer_override = 1;
+       /*
+        * The ati_ixp4x0_rev() early PCI quirk should have set
+        * the acpi_skip_timer_override flag already:
+        */
+       if (!acpi_skip_timer_override) {
+               WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
+               pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+                       d->ident);
+               acpi_skip_timer_override = 1;
+       }
        return 0;
 }
 
index 148fcfe..4cd8083 100644 (file)
@@ -723,9 +723,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
        init_iommu_from_acpi(iommu, h);
        init_iommu_devices(iommu);
 
-       pci_enable_device(iommu->dev);
-
-       return 0;
+       return pci_enable_device(iommu->dev);
 }
 
 /*
index 7581b62..fb789dd 100644 (file)
@@ -1121,16 +1121,5 @@ void __cpuinit cpu_init(void)
        xsave_init();
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-void __cpuinit cpu_uninit(void)
-{
-       int cpu = raw_smp_processor_id();
-       cpu_clear(cpu, cpu_initialized);
-
-       /* lazy TLB state */
-       per_cpu(cpu_tlbstate, cpu).state = 0;
-       per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
-}
-#endif
 
 #endif
index a47798b..395acb1 100644 (file)
@@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = {
                .ds             = __USER_DS,
                .fs             = __KERNEL_PERCPU,
 
-               .__cr3          = __pa(swapper_pg_dir)
+               .__cr3          = __phys_addr_const((unsigned long)swapper_pg_dir)
        }
 };
index 24bb5fa..733c4f8 100644 (file)
@@ -95,6 +95,52 @@ static void __init nvidia_bugs(int num, int slot, int func)
 
 }
 
+static u32 ati_ixp4x0_rev(int num, int slot, int func)
+{
+       u32 d;
+       u8  b;
+
+       b = read_pci_config_byte(num, slot, func, 0xac);
+       b &= ~(1<<5);
+       write_pci_config_byte(num, slot, func, 0xac, b);
+
+       d = read_pci_config(num, slot, func, 0x70);
+       d |= 1<<8;
+       write_pci_config(num, slot, func, 0x70, d);
+
+       d = read_pci_config(num, slot, func, 0x8);
+       d &= 0xff;
+       return d;
+}
+
+static void __init ati_bugs(int num, int slot, int func)
+{
+#if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC)
+       u32 d;
+       u8  b;
+
+       if (acpi_use_timer_override)
+               return;
+
+       d = ati_ixp4x0_rev(num, slot, func);
+       if (d  < 0x82)
+               acpi_skip_timer_override = 1;
+       else {
+               /* check for IRQ0 interrupt swap */
+               outb(0x72, 0xcd6); b = inb(0xcd7);
+               if (!(b & 0x2))
+                       acpi_skip_timer_override = 1;
+       }
+
+       if (acpi_skip_timer_override) {
+               printk(KERN_INFO "SB4X0 revision 0x%x\n", d);
+               printk(KERN_INFO "Ignoring ACPI timer override.\n");
+               printk(KERN_INFO "If you got timer trouble "
+                      "try acpi_use_timer_override\n");
+       }
+#endif
+}
+
 #ifdef CONFIG_DMAR
 static void __init intel_g33_dmar(int num, int slot, int func)
 {
@@ -128,6 +174,8 @@ static struct chipset early_qrk[] __initdata = {
          PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
        { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
          PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
+       { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
+         PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
 #ifdef CONFIG_DMAR
        { PCI_VENDOR_ID_INTEL, 0x29c0,
          PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
index ff9e735..34ad997 100644 (file)
@@ -3,11 +3,19 @@
 #include <linux/init.h>
 #include <linux/string.h>
 #include <linux/screen_info.h>
+#include <linux/usb/ch9.h>
+#include <linux/pci_regs.h>
+#include <linux/pci_ids.h>
+#include <linux/errno.h>
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/fcntl.h>
 #include <asm/setup.h>
 #include <xen/hvc-console.h>
+#include <asm/pci-direct.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <linux/usb/ehci_def.h>
 
 /* Simple VGA output */
 #define VGABASE                (__ISA_IO_base + 0xb8000)
@@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8;  /* ttyS0 */
 static int early_serial_putc(unsigned char ch)
 {
        unsigned timeout = 0xffff;
+
        while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
                cpu_relax();
        outb(ch, early_serial_base + TXR);
@@ -111,7 +120,7 @@ static __init void early_serial_init(char *s)
                if (!strncmp(s, "0x", 2)) {
                        early_serial_base = simple_strtoul(s, &e, 16);
                } else {
-                       static int bases[] = { 0x3f8, 0x2f8 };
+                       static const int __initconst bases[] = { 0x3f8, 0x2f8 };
 
                        if (!strncmp(s, "ttyS", 4))
                                s += 4;
@@ -151,6 +160,721 @@ static struct console early_serial_console = {
        .index =        -1,
 };
 
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+
+/*
+ * Pointers into the mapped EHCI BAR, set up by early_dbgp_init():
+ * ehci_caps is the start of the BAR, ehci_regs lies HC_LENGTH() bytes
+ * past it and ehci_debug at the offset taken from the PCI debug-port
+ * capability.  ehci_debug is reset to NULL when setup fails.
+ */
+static struct ehci_caps __iomem *ehci_caps;
+static struct ehci_regs __iomem *ehci_regs;
+static struct ehci_dbg_port __iomem *ehci_debug;
+/* bDebugOutEndpoint of the debug descriptor read in ehci_setup() */
+static unsigned int dbgp_endpoint_out;
+
+/* PCI bus/slot/function of the EHCI controller chosen by early_dbgp_init() */
+struct ehci_dev {
+       u32 bus;
+       u32 slot;
+       u32 func;
+};
+
+static struct ehci_dev ehci_dev;
+
+#define USB_DEBUG_DEVNUM 127
+
+#define DBGP_DATA_TOGGLE       0x8800
+
+/*
+ * Derive the next PID register value from the previous one: flip the
+ * DBGP_DATA_TOGGLE bits of @x, keep only bits 8-23 of the result and
+ * place the low byte of @tok in bits 0-7.
+ */
+static inline u32 dbgp_pid_update(u32 x, u32 tok)
+{
+       u32 toggled = x ^ DBGP_DATA_TOGGLE;
+       u32 token = tok & 0xff;
+
+       return (toggled & 0xffff00) | token;
+}
+
+/*
+ * Replace the low four bits of @x with the low four bits of @len and
+ * leave every other bit of @x untouched.
+ */
+static inline u32 dbgp_len_update(u32 x, u32 len)
+{
+       u32 kept = x & ~0x0f;
+       u32 length = len & 0x0f;
+
+       return kept | length;
+}
+
+/*
+ * USB Packet IDs (PIDs)
+ */
+
+/* token */
+#define USB_PID_OUT            0xe1
+#define USB_PID_IN             0x69
+#define USB_PID_SOF            0xa5
+#define USB_PID_SETUP          0x2d
+/* handshake */
+#define USB_PID_ACK            0xd2
+#define USB_PID_NAK            0x5a
+#define USB_PID_STALL          0x1e
+#define USB_PID_NYET           0x96
+/* data */
+#define USB_PID_DATA0          0xc3
+#define USB_PID_DATA1          0x4b
+#define USB_PID_DATA2          0x87
+#define USB_PID_MDATA          0x0f
+/* Special */
+#define USB_PID_PREAMBLE       0x3c
+#define USB_PID_ERR            0x3c
+#define USB_PID_SPLIT          0x78
+#define USB_PID_PING           0xb4
+#define USB_PID_UNDEF_0                0xf0
+
+#define USB_PID_DATA_TOGGLE    0x88
+#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
+
+#define PCI_CAP_ID_EHCI_DEBUG  0xa
+
+#define HUB_ROOT_RESET_TIME    50      /* times are in msec */
+#define HUB_SHORT_RESET_TIME   10
+#define HUB_LONG_RESET_TIME    200
+#define HUB_RESET_TIMEOUT      500
+
+#define DBGP_MAX_PACKET                8
+
+/*
+ * Busy-poll the debug port control register until DBGP_DONE is seen or
+ * 0x100000 reads have been made.
+ *
+ * Returns -1 when the poll budget is exhausted.  Otherwise the done bit
+ * is written back and the result is -DBGP_ERRCODE(ctrl) if DBGP_ERROR is
+ * set, or DBGP_LEN(ctrl) if it is not.
+ */
+static int dbgp_wait_until_complete(void)
+{
+       u32 ctrl;
+       int loop = 0x100000;
+
+       do {
+               ctrl = readl(&ehci_debug->control);
+               /* Stop when the transaction is finished */
+               if (ctrl & DBGP_DONE)
+                       break;
+       } while (--loop > 0);
+
+       /* loop only reaches zero when no poll ever saw DBGP_DONE */
+       if (!loop)
+               return -1;
+
+       /*
+        * Now that we have observed the completed transaction,
+        * clear the done bit.
+        */
+       writel(ctrl | DBGP_DONE, &ehci_debug->control);
+       return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
+}
+
+/*
+ * Crude busy-wait: for each of the @ms outer iterations, issue 1000 byte
+ * writes of 0x1 to I/O port 0x80.
+ * NOTE(review): presumably one such port write takes about a microsecond,
+ * which would make an outer iteration roughly 1 ms - not established by
+ * this code, confirm before relying on the timing.
+ */
+static void dbgp_mdelay(int ms)
+{
+       int i;
+
+       while (ms--) {
+               for (i = 0; i < 1000; i++)
+                       outb(0x1, 0x80);
+       }
+}
+
+/*
+ * Pacing hook called by dbgp_wait_until_done() after a NAK or NYET.
+ * The body is empty, so at present no delay actually happens.
+ */
+static void dbgp_breath(void)
+{
+       /* Meant to sleep so the debug port can breathe; not implemented */
+}
+
+/*
+ * Start the transaction described by @ctrl (DBGP_GO is OR-ed in here) and
+ * wait for it to complete.
+ *
+ * A transaction answered with a NAK is reissued; at most three attempts
+ * are made in total.  Returns the result of dbgp_wait_until_complete()
+ * for the last attempt: negative on timeout or error, otherwise the
+ * length it reported.
+ */
+static int dbgp_wait_until_done(unsigned ctrl)
+{
+       u32 pids, lpid;
+       int ret;
+       int loop = 3;
+
+retry:
+       writel(ctrl | DBGP_GO, &ehci_debug->control);
+       ret = dbgp_wait_until_complete();
+       /* PID extracted from the pids register after the transaction */
+       pids = readl(&ehci_debug->pids);
+       lpid = DBGP_PID_GET(pids);
+
+       if (ret < 0)
+               return ret;
+
+       /*
+        * If the port is getting full or it has dropped data
+        * start pacing ourselves, not necessary but it's friendly.
+        */
+       if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
+               dbgp_breath();
+
+       /* If I get a NACK reissue the transmission */
+       if (lpid == USB_PID_NAK) {
+               if (--loop > 0)
+                       goto retry;
+       }
+
+       return ret;
+}
+
+/*
+ * Load up to eight bytes from @buf into the debug port data registers.
+ *
+ * Bytes 0-3 are packed least-significant-byte first into data03 and
+ * bytes 4-7 likewise into data47; positions at or beyond @size are
+ * written as zero.  Both registers are always written.
+ */
+static void dbgp_set_data(const void *buf, int size)
+{
+       const unsigned char *bytes = buf;
+       u32 lo, hi;
+       int i;
+
+       lo = hi = 0;
+       /*
+        * Widen each byte to u32 before shifting: an unsigned char is
+        * promoted to signed int, and shifting a value >= 0x80 left by 24
+        * would overflow that int.
+        */
+       for (i = 0; i < 4 && i < size; i++)
+               lo |= (u32)bytes[i] << (8*i);
+       for (; i < 8 && i < size; i++)
+               hi |= (u32)bytes[i] << (8*(i - 4));
+       writel(lo, &ehci_debug->data03);
+       writel(hi, &ehci_debug->data47);
+}
+
+/*
+ * Copy up to eight bytes out of the debug port data registers into @buf.
+ *
+ * data03 supplies bytes 0-3 and data47 bytes 4-7, least significant byte
+ * first; no more than @size bytes are stored.  Both registers are read
+ * regardless of @size.
+ */
+static void dbgp_get_data(void *buf, int size)
+{
+       unsigned char *out = buf;
+       u32 word[2];
+       int n;
+
+       word[0] = readl(&ehci_debug->data03);
+       word[1] = readl(&ehci_debug->data47);
+
+       for (n = 0; n < 8 && n < size; n++)
+               out[n] = (word[n / 4] >> (8 * (n % 4))) & 0xff;
+}
+
+/*
+ * Send @size bytes (at most DBGP_MAX_PACKET) from @bytes to @endpoint of
+ * device @devnum as a single OUT transaction through the debug port.
+ *
+ * Returns -1 if @size exceeds DBGP_MAX_PACKET, otherwise whatever
+ * dbgp_wait_until_done() returns for the transaction.
+ */
+static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
+                        const char *bytes, int size)
+{
+       u32 pid_reg, ctrl_reg;
+
+       if (size > DBGP_MAX_PACKET)
+               return -1;
+
+       /* next data toggle with an OUT token */
+       pid_reg = dbgp_pid_update(readl(&ehci_debug->pids), USB_PID_OUT);
+
+       ctrl_reg = dbgp_len_update(readl(&ehci_debug->control), size);
+       ctrl_reg |= DBGP_OUT | DBGP_GO;
+
+       /* payload, address and PIDs are programmed before the GO write */
+       dbgp_set_data(bytes, size);
+       writel(DBGP_EPADDR(devnum, endpoint), &ehci_debug->address);
+       writel(pid_reg, &ehci_debug->pids);
+
+       return dbgp_wait_until_done(ctrl_reg);
+}
+
+/*
+ * Issue one IN transaction to @endpoint of device @devnum and copy the
+ * received bytes into @data.
+ *
+ * @size is the buffer size and must not exceed DBGP_MAX_PACKET (-1 is
+ * returned otherwise).  Returns a negative value from
+ * dbgp_wait_until_done() on failure, else the length it reported; no more
+ * than min(@size, that length) bytes are stored in @data.
+ */
+static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
+                                int size)
+{
+       u32 pids, addr, ctrl;
+       int ret;
+
+       if (size > DBGP_MAX_PACKET)
+               return -1;
+
+       addr = DBGP_EPADDR(devnum, endpoint);
+
+       pids = readl(&ehci_debug->pids);
+       pids = dbgp_pid_update(pids, USB_PID_IN);
+
+       ctrl = readl(&ehci_debug->control);
+       ctrl = dbgp_len_update(ctrl, size);
+       /* clearing DBGP_OUT selects the IN direction */
+       ctrl &= ~DBGP_OUT;
+       ctrl |= DBGP_GO;
+
+       writel(addr, &ehci_debug->address);
+       writel(pids, &ehci_debug->pids);
+       ret = dbgp_wait_until_done(ctrl);
+       if (ret < 0)
+               return ret;
+
+       /* never copy more than the transaction actually delivered */
+       if (size > ret)
+               size = ret;
+       dbgp_get_data(data, size);
+       return ret;
+}
+
+/*
+ * Perform a control transfer on endpoint 0 of device @devnum through the
+ * debug port: send the SETUP packet built from @requesttype, @request,
+ * @value, @index and @size, then read the reply with dbgp_bulk_read().
+ *
+ * For requests with USB_DIR_IN set, @size may be up to DBGP_MAX_PACKET;
+ * for all others it must be 0.  Returns -1 when @size is out of range, a
+ * negative value if the SETUP transaction fails, otherwise the result of
+ * dbgp_bulk_read().
+ */
+static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
+       int value, int index, void *data, int size)
+{
+       u32 pids, addr, ctrl;
+       struct usb_ctrlrequest req;
+       int read;
+       int ret;
+
+       read = (requesttype & USB_DIR_IN) != 0;
+       /* no data stage towards the device is supported */
+       if (size > (read ? DBGP_MAX_PACKET:0))
+               return -1;
+
+       /* Compute the control message */
+       req.bRequestType = requesttype;
+       req.bRequest = request;
+       req.wValue = cpu_to_le16(value);
+       req.wIndex = cpu_to_le16(index);
+       req.wLength = cpu_to_le16(size);
+
+       pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
+       addr = DBGP_EPADDR(devnum, 0);
+
+       ctrl = readl(&ehci_debug->control);
+       ctrl = dbgp_len_update(ctrl, sizeof(req));
+       ctrl |= DBGP_OUT;
+       ctrl |= DBGP_GO;
+
+       /* Send the setup message */
+       dbgp_set_data(&req, sizeof(req));
+       writel(addr, &ehci_debug->address);
+       writel(pids, &ehci_debug->pids);
+       ret = dbgp_wait_until_done(ctrl);
+       if (ret < 0)
+               return ret;
+
+       /* Read the result */
+       return dbgp_bulk_read(devnum, 0, data, size);
+}
+
+
+/*
+ * Walk the PCI capability list of device num:slot.func looking for the
+ * capability with ID @cap.
+ *
+ * Returns the config-space offset of the matching capability, or 0 when
+ * the device has no capability list, the ID is not found, an ID of 0xff
+ * is read, or 48 entries have been visited.
+ */
+static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
+{
+       int remaining = 48;
+       u8 pos, id;
+
+       if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
+               PCI_STATUS_CAP_LIST))
+               return 0;
+
+       pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
+       while (remaining-- > 0 && pos >= 0x40) {
+               /* the two low bits of the pointer are masked off */
+               pos &= ~3;
+               id = read_pci_config_byte(num, slot, func,
+                                         pos + PCI_CAP_LIST_ID);
+               if (id == 0xff)
+                       return 0;
+               if (id == cap)
+                       return pos;
+
+               pos = read_pci_config_byte(num, slot, func,
+                                          pos + PCI_CAP_LIST_NEXT);
+       }
+       return 0;
+}
+
+/*
+ * Return the config-space offset of the EHCI debug port capability of
+ * device bus:slot.func, or 0 if the device is not of class
+ * PCI_CLASS_SERIAL_USB_EHCI or has no such capability.
+ */
+static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
+{
+       u32 class_rev = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
+
+       /* the low byte is the revision; the rest is the class code */
+       if ((class_rev >> 8) == PCI_CLASS_SERIAL_USB_EHCI)
+               return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
+
+       return 0;
+}
+
+/*
+ * Scan every PCI bus/slot/function for EHCI controllers with a debug port
+ * capability and select the one with index @ehci_num (0 = first found).
+ *
+ * On success the location is stored in @rbus, @rslot and @rfunc and the
+ * capability offset is returned; 0 is returned when fewer than
+ * @ehci_num + 1 such controllers exist.
+ */
+static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
+{
+       u32 b, s, f;
+
+       for (b = 0; b < 256; b++) {
+               for (s = 0; s < 32; s++) {
+                       for (f = 0; f < 8; f++) {
+                               unsigned offset = __find_dbgp(b, s, f);
+
+                               /* count down only on real debug ports */
+                               if (offset && ehci_num-- == 0) {
+                                       *rbus = b;
+                                       *rslot = s;
+                                       *rfunc = f;
+                                       return offset;
+                               }
+                       }
+               }
+       }
+       return 0;
+}
+
+/*
+ * Reset root port @port (1-based) and wait for it to come up enabled.
+ *
+ * The port status is re-checked every HUB_ROOT_RESET_TIME ms until
+ * HUB_RESET_TIMEOUT ms have been waited.  Returns 0 once the port is out
+ * of reset with PORT_PE set, -ENOTCONN if PORT_CONNECT is clear,
+ * -EINVAL if PORT_CSC is set, and -EBUSY on timeout.
+ */
+static int ehci_reset_port(int port)
+{
+       u32 portsc;
+       u32 delay_time, delay;
+       int loop;
+
+       /* Reset the usb debug port */
+       portsc = readl(&ehci_regs->port_status[port - 1]);
+       portsc &= ~PORT_PE;
+       portsc |= PORT_RESET;
+       writel(portsc, &ehci_regs->port_status[port - 1]);
+
+       delay = HUB_ROOT_RESET_TIME;
+       for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
+            delay_time += delay) {
+               dbgp_mdelay(delay);
+
+               portsc = readl(&ehci_regs->port_status[port - 1]);
+               if (portsc & PORT_RESET) {
+                       /* force reset to complete */
+                       loop = 2;
+                       writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
+                               &ehci_regs->port_status[port - 1]);
+                       do {
+                               portsc = readl(&ehci_regs->port_status[port-1]);
+                       } while ((portsc & PORT_RESET) && (--loop > 0));
+               }
+
+               /* Device went away? */
+               if (!(portsc & PORT_CONNECT))
+                       return -ENOTCONN;
+
+               /* bomb out completely if something weird happened */
+               if ((portsc & PORT_CSC))
+                       return -EINVAL;
+
+               /* If we've finished resetting, then break out of the loop */
+               if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
+                       return 0;
+       }
+       return -EBUSY;
+}
+
+/*
+ * Wait for a device on root port @port (1-based) and reset the port.
+ *
+ * Up to three times: delay 100 ms, then, if STS_PCD is set in the status
+ * register, try ehci_reset_port().  Returns 0 as soon as a reset
+ * succeeds, -ENOTCONN if none of the attempts did.
+ */
+static int ehci_wait_for_port(int port)
+{
+       u32 status;
+       int ret, reps;
+
+       for (reps = 0; reps < 3; reps++) {
+               dbgp_mdelay(100);
+               status = readl(&ehci_regs->status);
+               if (status & STS_PCD) {
+                       ret = ehci_reset_port(port);
+                       if (ret == 0)
+                               return 0;
+               }
+       }
+       return -ENOTCONN;
+}
+
+#ifdef DBGP_DEBUG
+# define dbgp_printk early_printk
+#else
+static inline void dbgp_printk(const char *fmt, ...) { }
+#endif
+
+/*
+ * Hook used by ehci_setup() to ask the controller to use a different
+ * physical port as its debug port.  The default does nothing;
+ * detect_set_debug_port() may install a vendor-specific implementation.
+ */
+typedef void (*set_debug_port_t)(int port);
+
+static void default_set_debug_port(int port)
+{
+}
+
+static set_debug_port_t set_debug_port = default_set_debug_port;
+
+/*
+ * set_debug_port implementation installed for PCI vendor ID 0x10de:
+ * store the low four bits of @port in bits 12-15 of the config dword at
+ * offset 0x74 of the EHCI function recorded in ehci_dev.
+ */
+static void nvidia_set_debug_port(int port)
+{
+       u32 dword;
+       dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
+                                0x74);
+       /* replace the 4-bit field at bits 12-15, keep everything else */
+       dword &= ~(0x0f<<12);
+       dword |= ((port & 0x0f)<<12);
+       write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
+                                dword);
+       dbgp_printk("set debug port to %d\n", port);
+}
+
+/*
+ * Choose the set_debug_port implementation for the controller recorded in
+ * ehci_dev: the low 16 bits of config dword 0x00 are compared against
+ * vendor ID 0x10de, and on a match nvidia_set_debug_port() is installed.
+ * Any other vendor keeps the current (default, no-op) hook.
+ */
+static void __init detect_set_debug_port(void)
+{
+       u32 vendorid;
+
+       vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
+                0x00);
+
+       if ((vendorid & 0xffff) == 0x10de) {
+               dbgp_printk("using nvidia set_debug_port\n");
+               set_debug_port = nvidia_set_debug_port;
+       }
+}
+
+/*
+ * Bring up the EHCI controller and its debug port and put the attached
+ * debug device into debug mode.
+ *
+ * Sequence: reset the controller, take ownership of the debug port,
+ * start the controller, wait for and reset a device on the debug port,
+ * enable the debug port, locate the device by asking each address for a
+ * debug descriptor, move it to address USB_DEBUG_DEVNUM, enable its debug
+ * mode and perform one small write.
+ *
+ * When no device shows up on the current debug port, set_debug_port() is
+ * asked to move to the next physical port and the sequence is retried;
+ * playtimes bounds the number of full passes.
+ *
+ * Returns 0 on success and -1 on any failure.  Failures after the debug
+ * port was enabled drop the DBGP_CLAIM bits again.
+ */
+static int __init ehci_setup(void)
+{
+       struct usb_debug_descriptor dbgp_desc;
+       u32 cmd, ctrl, status, portsc, hcs_params;
+       u32 debug_port, new_debug_port = 0, n_ports;
+       u32  devnum;
+       int ret, i;
+       int loop;
+       int port_map_tried;
+       int playtimes = 3;
+
+try_next_time:
+       port_map_tried = 0;
+
+try_next_port:
+
+       hcs_params = readl(&ehci_caps->hcs_params);
+       debug_port = HCS_DEBUG_PORT(hcs_params);
+       n_ports    = HCS_N_PORTS(hcs_params);
+
+       dbgp_printk("debug_port: %d\n", debug_port);
+       dbgp_printk("n_ports:    %d\n", n_ports);
+
+       for (i = 1; i <= n_ports; i++) {
+               portsc = readl(&ehci_regs->port_status[i-1]);
+               dbgp_printk("portstatus%d: %08x\n", i, portsc);
+       }
+
+       /*
+        * A port switch was requested but the controller still reports a
+        * different debug port: ask again and start a new pass, or give up.
+        */
+       if (port_map_tried && (new_debug_port != debug_port)) {
+               if (--playtimes) {
+                       set_debug_port(new_debug_port);
+                       goto try_next_time;
+               }
+               return -1;
+       }
+
+       loop = 10;
+       /* Reset the EHCI controller */
+       cmd = readl(&ehci_regs->command);
+       cmd |= CMD_RESET;
+       writel(cmd, &ehci_regs->command);
+       do {
+               cmd = readl(&ehci_regs->command);
+       } while ((cmd & CMD_RESET) && (--loop > 0));
+
+       if (!loop) {
+               dbgp_printk("can not reset ehci\n");
+               return -1;
+       }
+       dbgp_printk("ehci reset done\n");
+
+       /* Claim ownership, but do not enable yet */
+       ctrl = readl(&ehci_debug->control);
+       ctrl |= DBGP_OWNER;
+       ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
+       writel(ctrl, &ehci_debug->control);
+
+       /* Start the ehci running */
+       cmd = readl(&ehci_regs->command);
+       cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
+       cmd |= CMD_RUN;
+       writel(cmd, &ehci_regs->command);
+
+       /* Ensure everything is routed to the EHCI */
+       writel(FLAG_CF, &ehci_regs->configured_flag);
+
+       /* Wait until the controller is no longer halted */
+       loop = 10;
+       do {
+               status = readl(&ehci_regs->status);
+       } while ((status & STS_HALT) && (--loop > 0));
+
+       if (!loop) {
+               /* still halted after all polls: the start failed */
+               dbgp_printk("ehci can not be started\n");
+               return -1;
+       }
+       dbgp_printk("ehci started\n");
+
+       /* Wait for a device to show up in the debug port */
+       ret = ehci_wait_for_port(debug_port);
+       if (ret < 0) {
+               dbgp_printk("No device found in debug port\n");
+               goto next_debug_port;
+       }
+       dbgp_printk("ehci wait for port done\n");
+
+       /* Enable the debug port */
+       ctrl = readl(&ehci_debug->control);
+       ctrl |= DBGP_CLAIM;
+       writel(ctrl, &ehci_debug->control);
+       ctrl = readl(&ehci_debug->control);
+       if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
+               dbgp_printk("No device in debug port\n");
+               writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
+               goto err;
+       }
+       dbgp_printk("debug ported enabled\n");
+
+       /* Completely transfer the debug device to the debug controller */
+       portsc = readl(&ehci_regs->port_status[debug_port - 1]);
+       portsc &= ~PORT_PE;
+       writel(portsc, &ehci_regs->port_status[debug_port - 1]);
+
+       dbgp_mdelay(100);
+
+       /* Find the debug device and make it device number 127 */
+       for (devnum = 0; devnum <= 127; devnum++) {
+               ret = dbgp_control_msg(devnum,
+                       USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+                       USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
+                       &dbgp_desc, sizeof(dbgp_desc));
+               if (ret > 0)
+                       break;
+       }
+       if (devnum > 127) {
+               dbgp_printk("Could not find attached debug device\n");
+               goto err;
+       }
+       if (ret < 0) {
+               dbgp_printk("Attached device is not a debug device\n");
+               goto err;
+       }
+       dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
+
+       /* Move the device to 127 if it isn't already there */
+       if (devnum != USB_DEBUG_DEVNUM) {
+               ret = dbgp_control_msg(devnum,
+                       USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+                       USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
+               if (ret < 0) {
+                       dbgp_printk("Could not move attached device to %d\n",
+                               USB_DEBUG_DEVNUM);
+                       goto err;
+               }
+               devnum = USB_DEBUG_DEVNUM;
+               dbgp_printk("debug device renamed to 127\n");
+       }
+
+       /* Enable the debug interface */
+       ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
+               USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+               USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
+       if (ret < 0) {
+               dbgp_printk(" Could not enable the debug device\n");
+               goto err;
+       }
+       dbgp_printk("debug interface enabled\n");
+
+       /* Perform a small write to get the even/odd data state in sync
+        */
+       ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
+       if (ret < 0) {
+               dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
+               goto err;
+       }
+       dbgp_printk("small write doned\n");
+
+       return 0;
+err:
+       /* Things didn't work so remove my claim */
+       ctrl = readl(&ehci_debug->control);
+       ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
+       writel(ctrl, &ehci_debug->control);
+       return -1;
+
+next_debug_port:
+       /* a controller reporting zero ports would make the modulo trap */
+       if (!n_ports)
+               return -1;
+
+       /* remember this port as tried and pick the next one, wrapping */
+       port_map_tried |= (1<<(debug_port - 1));
+       new_debug_port = ((debug_port-1+1)%n_ports) + 1;
+       if (port_map_tried != ((1<<n_ports) - 1)) {
+               set_debug_port(new_debug_port);
+               goto try_next_port;
+       }
+       /* every port was tried in this pass: start over while passes remain */
+       if (--playtimes) {
+               set_debug_port(new_debug_port);
+               goto try_next_time;
+       }
+
+       return -1;
+}
+
+/*
+ * Locate and initialise the EHCI debug port selected by @s.
+ *
+ * @s is the text following "dbgp" in the earlyprintk option; when it is
+ * non-empty it is parsed as the decimal index of the debug-capable EHCI
+ * controller to use (0 = first).  The controller's BAR is mapped through
+ * the FIX_DBGP_BASE fixmap slot, the ehci_caps/ehci_regs/ehci_debug
+ * pointers and ehci_dev are filled in, and ehci_setup() is run.
+ *
+ * Returns 0 on success, -1 if early PCI access is not allowed, no
+ * suitable controller is found, the debug registers are not in a plain
+ * memory BAR at PCI_BASE_ADDRESS_0, or ehci_setup() fails (in which case
+ * ehci_debug is reset to NULL).
+ */
+static int __init early_dbgp_init(char *s)
+{
+       u32 debug_port, bar, offset;
+       u32 bus, slot, func, cap;
+       void __iomem *ehci_bar;
+       u32 dbgp_num;
+       u32 bar_val;
+       char *e;
+       int ret;
+       u8 byte;
+
+       if (!early_pci_allowed())
+               return -1;
+
+       dbgp_num = 0;
+       if (*s)
+               dbgp_num = simple_strtoul(s, &e, 10);
+       dbgp_printk("dbgp_num: %d\n", dbgp_num);
+
+       cap = find_dbgp(dbgp_num, &bus, &slot, &func);
+       if (!cap)
+               return -1;
+
+       dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
+                        func);
+
+       /*
+        * Capability dword: bits 29-31 give the BAR number and bits 16-27
+        * the offset of the debug registers inside that BAR.  The BAR
+        * number is converted to a config-space offset (4 * n + 0xc).
+        */
+       debug_port = read_pci_config(bus, slot, func, cap);
+       bar = (debug_port >> 29) & 0x7;
+       bar = (bar * 4) + 0xc;
+       offset = (debug_port >> 16) & 0xfff;
+       dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
+       if (bar != PCI_BASE_ADDRESS_0) {
+               dbgp_printk("only debug ports on bar 1 handled.\n");
+
+               return -1;
+       }
+
+       bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+       dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
+       /* any flag bit set below the address mask means an unsupported BAR */
+       if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
+               dbgp_printk("only simple 32bit mmio bars supported\n");
+
+               return -1;
+       }
+
+       /* double check if the mem space is enabled */
+       byte = read_pci_config_byte(bus, slot, func, 0x04);
+       if (!(byte & 0x2)) {
+               byte  |= 0x02;
+               write_pci_config_byte(bus, slot, func, 0x04, byte);
+               dbgp_printk("mmio for ehci enabled\n");
+       }
+
+       /*
+        * FIXME I don't have the bar size so just guess PAGE_SIZE is more
+        * than enough.  1K is the biggest I have seen.
+        */
+       set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
+       ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
+       /* the BAR need not be page aligned: add back the in-page offset */
+       ehci_bar += bar_val & ~PAGE_MASK;
+       dbgp_printk("ehci_bar: %p\n", ehci_bar);
+
+       ehci_caps  = ehci_bar;
+       ehci_regs  = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
+       ehci_debug = ehci_bar + offset;
+       ehci_dev.bus = bus;
+       ehci_dev.slot = slot;
+       ehci_dev.func = func;
+
+       detect_set_debug_port();
+
+       ret = ehci_setup();
+       if (ret < 0) {
+               dbgp_printk("ehci_setup failed\n");
+               /* early_dbgp_write() checks this and becomes a no-op */
+               ehci_debug = NULL;
+
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Console write callback: send the @n bytes at @str to the debug device
+ * in chunks of at most DBGP_MAX_PACKET bytes.
+ *
+ * Does nothing when the debug port is not set up (ehci_debug is NULL).
+ * Output is best effort: a failed chunk is dropped and the remaining
+ * chunks are still attempted.
+ */
+static void early_dbgp_write(struct console *con, const char *str, u32 n)
+{
+       int chunk;
+
+       if (!ehci_debug)
+               return;
+       while (n > 0) {
+               /*
+                * Clamp while still unsigned so that a huge n cannot turn
+                * into a negative chunk size.
+                */
+               chunk = (n > DBGP_MAX_PACKET) ? DBGP_MAX_PACKET : n;
+               /* return value deliberately ignored: nowhere to report it */
+               dbgp_bulk_write(USB_DEBUG_DEVNUM,
+                       dbgp_endpoint_out, str, chunk);
+               str += chunk;
+               n -= chunk;
+       }
+}
+
+/*
+ * Early console backed by the EHCI debug port; selected by
+ * setup_early_printk() when the earlyprintk option starts with "dbgp".
+ */
+static struct console early_dbgp_console = {
+       .name =         "earlydbg",
+       .write =        early_dbgp_write,
+       .flags =        CON_PRINTBUFFER,
+       .index =        -1,
+};
+#endif
+
 /* Console interface to a host file on AMD's SimNow! */
 
 static int simnow_fd;
@@ -165,6 +889,7 @@ enum {
 static noinline long simnow(long cmd, long a, long b, long c)
 {
        long ret;
+
        asm volatile("cpuid" :
                     "=a" (ret) :
                     "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
@@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c)
 static void __init simnow_init(char *str)
 {
        char *fn = "klog";
+
        if (*str == '=')
                fn = ++str;
        /* error ignored */
@@ -194,7 +920,7 @@ static struct console simnow_console = {
 
 /* Direct interface for emergencies */
 static struct console *early_console = &early_vga_console;
-static int early_console_initialized;
+static int __initdata early_console_initialized;
 
 asmlinkage void early_printk(const char *fmt, ...)
 {
@@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...)
        va_end(ap);
 }
 
-static int __initdata keep_early;
 
 static int __init setup_early_printk(char *buf)
 {
+       int keep_early;
+
        if (!buf)
                return 0;
 
@@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf)
                return 0;
        early_console_initialized = 1;
 
-       if (strstr(buf, "keep"))
-               keep_early = 1;
+       keep_early = (strstr(buf, "keep") != NULL);
 
        if (!strncmp(buf, "serial", 6)) {
                early_serial_init(buf + 6);
@@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf)
                simnow_init(buf + 6);
                early_console = &simnow_console;
                keep_early = 1;
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+       } else if (!strncmp(buf, "dbgp", 4)) {
+               if (early_dbgp_init(buf+4) < 0)
+                       return 0;
+               early_console = &early_dbgp_console;
+               /*
+                * usb subsys will reset ehci controller, so don't keep
+                * that early console
+                */
+               keep_early = 0;
+#endif
 #ifdef CONFIG_HVC_XEN
        } else if (!strncmp(buf, "xen", 3)) {
                early_console = &xenboot_console;
@@ -251,4 +988,5 @@ static int __init setup_early_printk(char *buf)
        register_console(early_console);
        return 0;
 }
+
 early_param("earlyprintk", setup_early_printk);
index 45723f1..1f20608 100644 (file)
@@ -468,9 +468,23 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
 
 static int save_i387_xsave(void __user *buf)
 {
+       struct task_struct *tsk = current;
        struct _fpstate_ia32 __user *fx = buf;
        int err = 0;
 
+       /*
+        * For legacy compatibility, we always set the FP/SSE bits in the
+        * bit vector while saving the state to the user context.
+        * This lets us capture any changes (during sigreturn) made to
+        * the FP/SSE bits by legacy applications which don't touch
+        * xstate_bv in the xsave header.
+        *
+        * xsave-aware applications can change xstate_bv in the xsave
+        * header as well as any contents in the memory layout.
+        * xrestore as part of sigreturn will capture all the changes.
+        */
+       tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+
        if (save_i387_fxsave(fx) < 0)
                return -1;
 
index a1bec29..02063ae 100644 (file)
@@ -1281,8 +1281,8 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
        printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
 
        icr = apic_icr_read();
-       printk(KERN_DEBUG "... APIC ICR: %08x\n", icr);
-       printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32);
+       printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
+       printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
 
        v = apic_read(APIC_LVTT);
        printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
index 0ed5f93..eee32b4 100644 (file)
@@ -52,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
        memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
               (mincount - oldsize) * LDT_ENTRY_SIZE);
 
+       paravirt_alloc_ldt(newldt, mincount);
+
 #ifdef CONFIG_X86_64
        /* CHECKME: Do we really need this ? */
        wmb();
@@ -74,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 #endif
        }
        if (oldsize) {
+               paravirt_free_ldt(oldldt, oldsize);
                if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
                        vfree(oldldt);
                else
@@ -85,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
 {
        int err = alloc_ldt(new, old->size, 0);
+       int i;
 
        if (err < 0)
                return err;
-       memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
+
+       for(i = 0; i < old->size; i++)
+               write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
        return 0;
 }
 
@@ -125,6 +131,7 @@ void destroy_context(struct mm_struct *mm)
                if (mm == current->active_mm)
                        clear_LDT();
 #endif
+               paravirt_free_ldt(mm->context.ldt, mm->context.size);
                if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
                        vfree(mm->context.ldt);
                else
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
deleted file mode 100644 (file)
index 652fa5c..0000000
+++ /dev/null
@@ -1,853 +0,0 @@
-/*
- *     Intel CPU Microcode Update Driver for Linux
- *
- *     Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *                   2006      Shaohua Li <shaohua.li@intel.com>
- *
- *     This driver allows to upgrade microcode on Intel processors
- *     belonging to IA-32 family - PentiumPro, Pentium II,
- *     Pentium III, Xeon, Pentium 4, etc.
- *
- *     Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
- *     Software Developer's Manual
- *     Order Number 253668 or free download from:
- *
- *     http://developer.intel.com/design/pentium4/manuals/253668.htm
- *
- *     For more information, go to http://www.urbanmyth.org/microcode
- *
- *     This program is free software; you can redistribute it and/or
- *     modify it under the terms of the GNU General Public License
- *     as published by the Free Software Foundation; either version
- *     2 of the License, or (at your option) any later version.
- *
- *     1.0     16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Initial release.
- *     1.01    18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Added read() support + cleanups.
- *     1.02    21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Added 'device trimming' support. open(O_WRONLY) zeroes
- *             and frees the saved copy of applied microcode.
- *     1.03    29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *             Made to use devfs (/dev/cpu/microcode) + cleanups.
- *     1.04    06 Jun 2000, Simon Trimmer <simon@veritas.com>
- *             Added misc device support (now uses both devfs and misc).
- *             Added MICROCODE_IOCFREE ioctl to clear memory.
- *     1.05    09 Jun 2000, Simon Trimmer <simon@veritas.com>
- *             Messages for error cases (non Intel & no suitable microcode).
- *     1.06    03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- *             Removed ->release(). Removed exclusive open and status bitmap.
- *             Added microcode_rwsem to serialize read()/write()/ioctl().
- *             Removed global kernel lock usage.
- *     1.07    07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- *             Write 0 to 0x8B msr and then cpuid before reading revision,
- *             so that it works even if there were no update done by the
- *             BIOS. Otherwise, reading from 0x8B gives junk (which happened
- *             to be 0 on my machine which is why it worked even when I
- *             disabled update by the BIOS)
- *             Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- *     1.08    11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- *                          Tigran Aivazian <tigran@veritas.com>
- *             Intel Pentium 4 processor support and bugfixes.
- *     1.09    30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- *             Bugfix for HT (Hyper-Threading) enabled processors
- *             whereby processor resources are shared by all logical processors
- *             in a single CPU package.
- *     1.10    28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- *             Tigran Aivazian <tigran@veritas.com>,
- *             Serialize updates as required on HT processors due to speculative
- *             nature of implementation.
- *     1.11    22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- *             Fix the panic when writing zero-length microcode chunk.
- *     1.12    29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- *             Jun Nakajima <jun.nakajima@intel.com>
- *             Support for the microcode updates in the new format.
- *     1.13    10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- *             Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- *             because we no longer hold a copy of applied microcode
- *             in kernel memory.
- *     1.14    25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- *             Fix sigmatch() macro to handle old CPUs with pf == 0.
- *             Thanks to Stuart Swales for pointing out this bug.
- */
-
-//#define DEBUG /* pr_debug */
-#include <linux/capability.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/smp_lock.h>
-#include <linux/cpumask.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/miscdevice.h>
-#include <linux/spinlock.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/mutex.h>
-#include <linux/cpu.h>
-#include <linux/firmware.h>
-#include <linux/platform_device.h>
-
-#include <asm/msr.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-
-MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
-
-#define MICROCODE_VERSION      "1.14a"
-
-#define DEFAULT_UCODE_DATASIZE         (2000)    /* 2000 bytes */
-#define MC_HEADER_SIZE         (sizeof (microcode_header_t))     /* 48 bytes */
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
-#define EXT_HEADER_SIZE                (sizeof (struct extended_sigtable)) /* 20 bytes */
-#define EXT_SIGNATURE_SIZE     (sizeof (struct extended_signature)) /* 12 bytes */
-#define DWSIZE                 (sizeof (u32))
-#define get_totalsize(mc) \
-       (((microcode_t *)mc)->hdr.totalsize ? \
-        ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
-#define get_datasize(mc) \
-       (((microcode_t *)mc)->hdr.datasize ? \
-        ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
-
-#define sigmatch(s1, s2, p1, p2) \
-       (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
-#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
-/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
-static DEFINE_MUTEX(microcode_mutex);
-
-static struct ucode_cpu_info {
-       int valid;
-       unsigned int sig;
-       unsigned int pf;
-       unsigned int rev;
-       microcode_t *mc;
-} ucode_cpu_info[NR_CPUS];
-
-static void collect_cpu_info(int cpu_num)
-{
-       struct cpuinfo_x86 *c = &cpu_data(cpu_num);
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-       unsigned int val[2];
-
-       /* We should bind the task to the CPU */
-       BUG_ON(raw_smp_processor_id() != cpu_num);
-       uci->pf = uci->rev = 0;
-       uci->mc = NULL;
-       uci->valid = 1;
-
-       if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
-               cpu_has(c, X86_FEATURE_IA64)) {
-               printk(KERN_ERR "microcode: CPU%d not a capable Intel "
-                       "processor\n", cpu_num);
-               uci->valid = 0;
-               return;
-       }
-
-       uci->sig = cpuid_eax(0x00000001);
-
-       if ((c->x86_model >= 5) || (c->x86 > 6)) {
-               /* get processor flags from MSR 0x17 */
-               rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
-               uci->pf = 1 << ((val[1] >> 18) & 7);
-       }
-
-       wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-       /* see notes above for revision 1.07.  Apparent chip bug */
-       sync_core();
-       /* get the current revision from MSR 0x8B */
-       rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
-       pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
-                       uci->sig, uci->pf, uci->rev);
-}
-
-static inline int microcode_update_match(int cpu_num,
-       microcode_header_t *mc_header, int sig, int pf)
-{
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
-       if (!sigmatch(sig, uci->sig, pf, uci->pf)
-               || mc_header->rev <= uci->rev)
-               return 0;
-       return 1;
-}
-
-static int microcode_sanity_check(void *mc)
-{
-       microcode_header_t *mc_header = mc;
-       struct extended_sigtable *ext_header = NULL;
-       struct extended_signature *ext_sig;
-       unsigned long total_size, data_size, ext_table_size;
-       int sum, orig_sum, ext_sigcount = 0, i;
-
-       total_size = get_totalsize(mc_header);
-       data_size = get_datasize(mc_header);
-       if (data_size + MC_HEADER_SIZE > total_size) {
-               printk(KERN_ERR "microcode: error! "
-                       "Bad data size in microcode data file\n");
-               return -EINVAL;
-       }
-
-       if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
-               printk(KERN_ERR "microcode: error! "
-                       "Unknown microcode update format\n");
-               return -EINVAL;
-       }
-       ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
-       if (ext_table_size) {
-               if ((ext_table_size < EXT_HEADER_SIZE)
-                || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
-                       printk(KERN_ERR "microcode: error! "
-                               "Small exttable size in microcode data file\n");
-                       return -EINVAL;
-               }
-               ext_header = mc + MC_HEADER_SIZE + data_size;
-               if (ext_table_size != exttable_size(ext_header)) {
-                       printk(KERN_ERR "microcode: error! "
-                               "Bad exttable size in microcode data file\n");
-                       return -EFAULT;
-               }
-               ext_sigcount = ext_header->count;
-       }
-
-       /* check extended table checksum */
-       if (ext_table_size) {
-               int ext_table_sum = 0;
-               int *ext_tablep = (int *)ext_header;
-
-               i = ext_table_size / DWSIZE;
-               while (i--)
-                       ext_table_sum += ext_tablep[i];
-               if (ext_table_sum) {
-                       printk(KERN_WARNING "microcode: aborting, "
-                               "bad extended signature table checksum\n");
-                       return -EINVAL;
-               }
-       }
-
-       /* calculate the checksum */
-       orig_sum = 0;
-       i = (MC_HEADER_SIZE + data_size) / DWSIZE;
-       while (i--)
-               orig_sum += ((int *)mc)[i];
-       if (orig_sum) {
-               printk(KERN_ERR "microcode: aborting, bad checksum\n");
-               return -EINVAL;
-       }
-       if (!ext_table_size)
-               return 0;
-       /* check extended signature checksum */
-       for (i = 0; i < ext_sigcount; i++) {
-               ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
-                         EXT_SIGNATURE_SIZE * i;
-               sum = orig_sum
-                       - (mc_header->sig + mc_header->pf + mc_header->cksum)
-                       + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
-               if (sum) {
-                       printk(KERN_ERR "microcode: aborting, bad checksum\n");
-                       return -EINVAL;
-               }
-       }
-       return 0;
-}
-
-/*
- * return 0 - no update found
- * return 1 - found update
- * return < 0 - error
- */
-static int get_maching_microcode(void *mc, int cpu)
-{
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-       microcode_header_t *mc_header = mc;
-       struct extended_sigtable *ext_header;
-       unsigned long total_size = get_totalsize(mc_header);
-       int ext_sigcount, i;
-       struct extended_signature *ext_sig;
-       void *new_mc;
-
-       if (microcode_update_match(cpu, mc_header,
-                       mc_header->sig, mc_header->pf))
-               goto find;
-
-       if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
-               return 0;
-
-       ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
-       ext_sigcount = ext_header->count;
-       ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
-       for (i = 0; i < ext_sigcount; i++) {
-               if (microcode_update_match(cpu, mc_header,
-                               ext_sig->sig, ext_sig->pf))
-                       goto find;
-               ext_sig++;
-       }
-       return 0;
-find:
-       pr_debug("microcode: CPU%d found a matching microcode update with"
-               " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
-       new_mc = vmalloc(total_size);
-       if (!new_mc) {
-               printk(KERN_ERR "microcode: error! Can not allocate memory\n");
-               return -ENOMEM;
-       }
-
-       /* free previous update file */
-       vfree(uci->mc);
-
-       memcpy(new_mc, mc, total_size);
-       uci->mc = new_mc;
-       return 1;
-}
-
-static void apply_microcode(int cpu)
-{
-       unsigned long flags;
-       unsigned int val[2];
-       int cpu_num = raw_smp_processor_id();
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
-       /* We should bind the task to the CPU */
-       BUG_ON(cpu_num != cpu);
-
-       if (uci->mc == NULL)
-               return;
-
-       /* serialize access to the physical write to MSR 0x79 */
-       spin_lock_irqsave(&microcode_update_lock, flags);
-
-       /* write microcode via MSR 0x79 */
-       wrmsr(MSR_IA32_UCODE_WRITE,
-               (unsigned long) uci->mc->bits,
-               (unsigned long) uci->mc->bits >> 16 >> 16);
-       wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
-       /* see notes above for revision 1.07.  Apparent chip bug */
-       sync_core();
-
-       /* get the current revision from MSR 0x8B */
-       rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
-       spin_unlock_irqrestore(&microcode_update_lock, flags);
-       if (val[1] != uci->mc->hdr.rev) {
-               printk(KERN_ERR "microcode: CPU%d update from revision "
-                       "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
-               return;
-       }
-       printk(KERN_INFO "microcode: CPU%d updated from revision "
-              "0x%x to 0x%x, date = %08x \n",
-              cpu_num, uci->rev, val[1], uci->mc->hdr.date);
-       uci->rev = val[1];
-}
-
-#ifdef CONFIG_MICROCODE_OLD_INTERFACE
-static void __user *user_buffer;       /* user area microcode data buffer */
-static unsigned int user_buffer_size;  /* it's size */
-
-static long get_next_ucode(void **mc, long offset)
-{
-       microcode_header_t mc_header;
-       unsigned long total_size;
-
-       /* No more data */
-       if (offset >= user_buffer_size)
-               return 0;
-       if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
-               printk(KERN_ERR "microcode: error! Can not read user data\n");
-               return -EFAULT;
-       }
-       total_size = get_totalsize(&mc_header);
-       if (offset + total_size > user_buffer_size) {
-               printk(KERN_ERR "microcode: error! Bad total size in microcode "
-                               "data file\n");
-               return -EINVAL;
-       }
-       *mc = vmalloc(total_size);
-       if (!*mc)
-               return -ENOMEM;
-       if (copy_from_user(*mc, user_buffer + offset, total_size)) {
-               printk(KERN_ERR "microcode: error! Can not read user data\n");
-               vfree(*mc);
-               return -EFAULT;
-       }
-       return offset + total_size;
-}
-
-static int do_microcode_update (void)
-{
-       long cursor = 0;
-       int error = 0;
-       void *new_mc = NULL;
-       int cpu;
-       cpumask_t old;
-
-       old = current->cpus_allowed;
-
-       while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
-               error = microcode_sanity_check(new_mc);
-               if (error)
-                       goto out;
-               /*
-                * It's possible the data file has multiple matching ucode,
-                * lets keep searching till the latest version
-                */
-               for_each_online_cpu(cpu) {
-                       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-                       if (!uci->valid)
-                               continue;
-                       set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-                       error = get_maching_microcode(new_mc, cpu);
-                       if (error < 0)
-                               goto out;
-                       if (error == 1)
-                               apply_microcode(cpu);
-               }
-               vfree(new_mc);
-       }
-out:
-       if (cursor > 0)
-               vfree(new_mc);
-       if (cursor < 0)
-               error = cursor;
-       set_cpus_allowed_ptr(current, &old);
-       return error;
-}
-
-static int microcode_open (struct inode *unused1, struct file *unused2)
-{
-       cycle_kernel_lock();
-       return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
-}
-
-static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
-{
-       ssize_t ret;
-
-       if ((len >> PAGE_SHIFT) > num_physpages) {
-               printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
-               return -EINVAL;
-       }
-
-       get_online_cpus();
-       mutex_lock(&microcode_mutex);
-
-       user_buffer = (void __user *) buf;
-       user_buffer_size = (int) len;
-
-       ret = do_microcode_update();
-       if (!ret)
-               ret = (ssize_t)len;
-
-       mutex_unlock(&microcode_mutex);
-       put_online_cpus();
-
-       return ret;
-}
-
-static const struct file_operations microcode_fops = {
-       .owner          = THIS_MODULE,
-       .write          = microcode_write,
-       .open           = microcode_open,
-};
-
-static struct miscdevice microcode_dev = {
-       .minor          = MICROCODE_MINOR,
-       .name           = "microcode",
-       .fops           = &microcode_fops,
-};
-
-static int __init microcode_dev_init (void)
-{
-       int error;
-
-       error = misc_register(&microcode_dev);
-       if (error) {
-               printk(KERN_ERR
-                       "microcode: can't misc_register on minor=%d\n",
-                       MICROCODE_MINOR);
-               return error;
-       }
-
-       return 0;
-}
-
-static void microcode_dev_exit (void)
-{
-       misc_deregister(&microcode_dev);
-}
-
-MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
-#else
-#define microcode_dev_init() 0
-#define microcode_dev_exit() do { } while(0)
-#endif
-
-static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
-       unsigned long size, long offset)
-{
-       microcode_header_t *mc_header;
-       unsigned long total_size;
-
-       /* No more data */
-       if (offset >= size)
-               return 0;
-       mc_header = (microcode_header_t *)(buf + offset);
-       total_size = get_totalsize(mc_header);
-
-       if (offset + total_size > size) {
-               printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
-               return -EINVAL;
-       }
-
-       *mc = vmalloc(total_size);
-       if (!*mc) {
-               printk(KERN_ERR "microcode: error! Can not allocate memory\n");
-               return -ENOMEM;
-       }
-       memcpy(*mc, buf + offset, total_size);
-       return offset + total_size;
-}
-
-/* fake device for request_firmware */
-static struct platform_device *microcode_pdev;
-
-static int cpu_request_microcode(int cpu)
-{
-       char name[30];
-       struct cpuinfo_x86 *c = &cpu_data(cpu);
-       const struct firmware *firmware;
-       const u8 *buf;
-       unsigned long size;
-       long offset = 0;
-       int error;
-       void *mc;
-
-       /* We should bind the task to the CPU */
-       BUG_ON(cpu != raw_smp_processor_id());
-       sprintf(name,"intel-ucode/%02x-%02x-%02x",
-               c->x86, c->x86_model, c->x86_mask);
-       error = request_firmware(&firmware, name, &microcode_pdev->dev);
-       if (error) {
-               pr_debug("microcode: data file %s load failed\n", name);
-               return error;
-       }
-       buf = firmware->data;
-       size = firmware->size;
-       while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
-                       > 0) {
-               error = microcode_sanity_check(mc);
-               if (error)
-                       break;
-               error = get_maching_microcode(mc, cpu);
-               if (error < 0)
-                       break;
-               /*
-                * It's possible the data file has multiple matching ucode,
-                * lets keep searching till the latest version
-                */
-               if (error == 1) {
-                       apply_microcode(cpu);
-                       error = 0;
-               }
-               vfree(mc);
-       }
-       if (offset > 0)
-               vfree(mc);
-       if (offset < 0)
-               error = offset;
-       release_firmware(firmware);
-
-       return error;
-}
-
-static int apply_microcode_check_cpu(int cpu)
-{
-       struct cpuinfo_x86 *c = &cpu_data(cpu);
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-       cpumask_t old;
-       unsigned int val[2];
-       int err = 0;
-
-       /* Check if the microcode is available */
-       if (!uci->mc)
-               return 0;
-
-       old = current->cpus_allowed;
-       set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-
-       /* Check if the microcode we have in memory matches the CPU */
-       if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
-           cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
-               err = -EINVAL;
-
-       if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
-               /* get processor flags from MSR 0x17 */
-               rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
-               if (uci->pf != (1 << ((val[1] >> 18) & 7)))
-                       err = -EINVAL;
-       }
-
-       if (!err) {
-               wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-               /* see notes above for revision 1.07.  Apparent chip bug */
-               sync_core();
-               /* get the current revision from MSR 0x8B */
-               rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-               if (uci->rev != val[1])
-                       err = -EINVAL;
-       }
-
-       if (!err)
-               apply_microcode(cpu);
-       else
-               printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
-                       " sig=0x%x, pf=0x%x, rev=0x%x\n",
-                       cpu, uci->sig, uci->pf, uci->rev);
-
-       set_cpus_allowed_ptr(current, &old);
-       return err;
-}
-
-static void microcode_init_cpu(int cpu, int resume)
-{
-       cpumask_t old;
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-       old = current->cpus_allowed;
-
-       set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-       mutex_lock(&microcode_mutex);
-       collect_cpu_info(cpu);
-       if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
-               cpu_request_microcode(cpu);
-       mutex_unlock(&microcode_mutex);
-       set_cpus_allowed_ptr(current, &old);
-}
-
-static void microcode_fini_cpu(int cpu)
-{
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-       mutex_lock(&microcode_mutex);
-       uci->valid = 0;
-       vfree(uci->mc);
-       uci->mc = NULL;
-       mutex_unlock(&microcode_mutex);
-}
-
-static ssize_t reload_store(struct sys_device *dev,
-                           struct sysdev_attribute *attr,
-                           const char *buf, size_t sz)
-{
-       struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-       char *end;
-       unsigned long val = simple_strtoul(buf, &end, 0);
-       int err = 0;
-       int cpu = dev->id;
-
-       if (end == buf)
-               return -EINVAL;
-       if (val == 1) {
-               cpumask_t old = current->cpus_allowed;
-
-               get_online_cpus();
-               set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-
-               mutex_lock(&microcode_mutex);
-               if (uci->valid)
-                       err = cpu_request_microcode(cpu);
-               mutex_unlock(&microcode_mutex);
-               put_online_cpus();
-               set_cpus_allowed_ptr(current, &old);
-       }
-       if (err)
-               return err;
-       return sz;
-}
-
-static ssize_t version_show(struct sys_device *dev,
-                       struct sysdev_attribute *attr, char *buf)
-{
-       struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
-       return sprintf(buf, "0x%x\n", uci->rev);
-}
-
-static ssize_t pf_show(struct sys_device *dev,
-                       struct sysdev_attribute *attr, char *buf)
-{
-       struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
-       return sprintf(buf, "0x%x\n", uci->pf);
-}
-
-static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
-static SYSDEV_ATTR(version, 0400, version_show, NULL);
-static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
-
-static struct attribute *mc_default_attrs[] = {
-       &attr_reload.attr,
-       &attr_version.attr,
-       &attr_processor_flags.attr,
-       NULL
-};
-
-static struct attribute_group mc_attr_group = {
-       .attrs = mc_default_attrs,
-       .name = "microcode",
-};
-
-static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
-{
-       int err, cpu = sys_dev->id;
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-       if (!cpu_online(cpu))
-               return 0;
-
-       pr_debug("microcode: CPU%d added\n", cpu);
-       memset(uci, 0, sizeof(*uci));
-
-       err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
-       if (err)
-               return err;
-
-       microcode_init_cpu(cpu, resume);
-
-       return 0;
-}
-
-static int mc_sysdev_add(struct sys_device *sys_dev)
-{
-       return __mc_sysdev_add(sys_dev, 0);
-}
-
-static int mc_sysdev_remove(struct sys_device *sys_dev)
-{
-       int cpu = sys_dev->id;
-
-       if (!cpu_online(cpu))
-               return 0;
-
-       pr_debug("microcode: CPU%d removed\n", cpu);
-       microcode_fini_cpu(cpu);
-       sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
-       return 0;
-}
-
-static int mc_sysdev_resume(struct sys_device *dev)
-{
-       int cpu = dev->id;
-
-       if (!cpu_online(cpu))
-               return 0;
-       pr_debug("microcode: CPU%d resumed\n", cpu);
-       /* only CPU 0 will apply ucode here */
-       apply_microcode(0);
-       return 0;
-}
-
-static struct sysdev_driver mc_sysdev_driver = {
-       .add = mc_sysdev_add,
-       .remove = mc_sysdev_remove,
-       .resume = mc_sysdev_resume,
-};
-
-static __cpuinit int
-mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (unsigned long)hcpu;
-       struct sys_device *sys_dev;
-
-       sys_dev = get_cpu_sysdev(cpu);
-       switch (action) {
-       case CPU_UP_CANCELED_FROZEN:
-               /* The CPU refused to come up during a system resume */
-               microcode_fini_cpu(cpu);
-               break;
-       case CPU_ONLINE:
-       case CPU_DOWN_FAILED:
-               mc_sysdev_add(sys_dev);
-               break;
-       case CPU_ONLINE_FROZEN:
-               /* System-wide resume is in progress, try to apply microcode */
-               if (apply_microcode_check_cpu(cpu)) {
-                       /* The application of microcode failed */
-                       microcode_fini_cpu(cpu);
-                       __mc_sysdev_add(sys_dev, 1);
-                       break;
-               }
-       case CPU_DOWN_FAILED_FROZEN:
-               if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
-                       printk(KERN_ERR "microcode: Failed to create the sysfs "
-                               "group for CPU%d\n", cpu);
-               break;
-       case CPU_DOWN_PREPARE:
-               mc_sysdev_remove(sys_dev);
-               break;
-       case CPU_DOWN_PREPARE_FROZEN:
-               /* Suspend is in progress, only remove the interface */
-               sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
-               break;
-       }
-       return NOTIFY_OK;
-}
-
-static struct notifier_block __refdata mc_cpu_notifier = {
-       .notifier_call = mc_cpu_callback,
-};
-
-static int __init microcode_init (void)
-{
-       int error;
-
-       printk(KERN_INFO
-               "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
-
-       error = microcode_dev_init();
-       if (error)
-               return error;
-       microcode_pdev = platform_device_register_simple("microcode", -1,
-                                                        NULL, 0);
-       if (IS_ERR(microcode_pdev)) {
-               microcode_dev_exit();
-               return PTR_ERR(microcode_pdev);
-       }
-
-       get_online_cpus();
-       error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
-       put_online_cpus();
-       if (error) {
-               microcode_dev_exit();
-               platform_device_unregister(microcode_pdev);
-               return error;
-       }
-
-       register_hotcpu_notifier(&mc_cpu_notifier);
-       return 0;
-}
-
-static void __exit microcode_exit (void)
-{
-       microcode_dev_exit();
-
-       unregister_hotcpu_notifier(&mc_cpu_notifier);
-
-       get_online_cpus();
-       sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
-       put_online_cpus();
-
-       platform_device_unregister(microcode_pdev);
-}
-
-module_init(microcode_init)
-module_exit(microcode_exit)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
new file mode 100644 (file)
index 0000000..7a1f8ee
--- /dev/null
@@ -0,0 +1,435 @@
+/*
+ *  AMD CPU Microcode Update Driver for Linux
+ *  Copyright (C) 2008 Advanced Micro Devices Inc.
+ *
+ *  Author: Peter Oruba <peter.oruba@amd.com>
+ *
+ *  Based on work by:
+ *  Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *
+ *  This driver allows to upgrade microcode on AMD
+ *  family 0x10 and 0x11 processors.
+ *
+ *  Licensed under the terms of the GNU General Public
+ *  License version 2. See file COPYING for details.
+*/
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/microcode.h>
+
+MODULE_DESCRIPTION("AMD Microcode Update Driver");
+MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>");
+MODULE_LICENSE("GPL v2");
+
+#define UCODE_MAGIC                0x00414d44
+#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
+#define UCODE_UCODE_TYPE           0x00000001
+
+/*
+ * One row of the equivalence table loaded by install_equiv_cpu_table().
+ *
+ * get_matching_microcode() walks the rows until it finds installed_cpu == 0
+ * (the terminator) and uses the row whose installed_cpu equals the running
+ * CPU's CPUID(1).EAX value to obtain equiv_cpu.  The fixed_errata_* fields
+ * are not referenced anywhere in this file.
+ */
+struct equiv_cpu_entry {
+       unsigned int installed_cpu;
+       unsigned int fixed_errata_mask;
+       unsigned int fixed_errata_compare;
+       unsigned int equiv_cpu;
+};
+
+/*
+ * Header found at the start of every microcode patch returned by
+ * get_next_ucode().  Fields this file actually looks at:
+ *   data_code         - first field; its address is what
+ *                       apply_microcode_amd() writes to MSR 0xc0010020
+ *   patch_id          - compared against the revision read from MSR 0x8B
+ *   mc_patch_data_len - multiplied by 28 in get_totalsize()
+ *   nb_dev_id/sb_dev_id, nb_rev_id/sb_rev_id
+ *                     - when the *_dev_id is non-zero, a PCI device with
+ *                       that id must exist and have the given revision
+ *   processor_rev_id  - the two bytes are compared with bits 7:0 and
+ *                       23:16 of the equivalent CPU id
+ * The remaining fields are carried but not interpreted here.
+ */
+struct microcode_header_amd {
+       unsigned int  data_code;
+       unsigned int  patch_id;
+       unsigned char mc_patch_data_id[2];
+       unsigned char mc_patch_data_len;
+       unsigned char init_flag;
+       unsigned int  mc_patch_data_checksum;
+       unsigned int  nb_dev_id;
+       unsigned int  sb_dev_id;
+       unsigned char processor_rev_id[2];
+       unsigned char nb_rev_id;
+       unsigned char sb_rev_id;
+       unsigned char bios_api_rev;
+       unsigned char reserved1[3];
+       unsigned int  match_reg[8];
+};
+
+/*
+ * A complete patch as kept in ucode_cpu_info[].mc: the fixed header
+ * followed by a variable amount of patch data (zero-length trailing array).
+ */
+struct microcode_amd {
+       struct microcode_header_amd hdr;
+       unsigned int mpb[0];
+};
+
+/* Size of the buffer get_next_ucode() allocates for a single patch. */
+#define UCODE_MAX_SIZE          (2048)
+#define DEFAULT_UCODE_DATASIZE (896)
+#define MC_HEADER_SIZE         (sizeof(struct microcode_header_amd))
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define DWSIZE                 (sizeof(u32))
+/* For now we support a fixed ucode total size only */
+/*
+ * Total size in bytes of the patch at @mc: header plus
+ * hdr.mc_patch_data_len units of 28 bytes.  @mc is parenthesized so that
+ * an expression argument (e.g. "base + off") is cast as a whole.
+ */
+#define get_totalsize(mc) \
+       ((((struct microcode_amd *)(mc))->hdr.mc_patch_data_len * 28) \
+        + MC_HEADER_SIZE)
+
+/* serialize access to the physical write */
+static DEFINE_SPINLOCK(microcode_update_lock);
+
+/*
+ * Equivalence table of the image currently being parsed; allocated by
+ * install_equiv_cpu_table() and released by free_equiv_cpu_table().
+ */
+static struct equiv_cpu_entry *equiv_cpu_table;
+
+/*
+ * Fill *csig for @cpu: clear it, then store the current patch level
+ * (MSR 0x8B) in csig->rev.
+ *
+ * The MSR is read on whichever CPU executes this function; nothing here
+ * migrates to @cpu, so callers are presumably already bound to it
+ * (NOTE(review): confirm against the callers in microcode_core.c).
+ *
+ * Returns 0 on success, -1 if @cpu is not an AMD family >= 0x10 processor.
+ */
+static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
+{
+       struct cpuinfo_x86 *c = &cpu_data(cpu);
+       unsigned int dummy;
+
+       memset(csig, 0, sizeof(*csig));
+
+       if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
+               printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
+                      cpu);
+               return -1;
+       }
+
+       /*
+        * RDMSR writes EDX as well as EAX.  Use the rdmsr() helper from
+        * <asm/msr.h>, which accounts for both halves, instead of inline
+        * asm that would have to list every clobbered register by hand.
+        * Only the low half carries the patch level.
+        */
+       rdmsr(0x0000008B, csig->rev, dummy);
+
+       printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
+               csig->rev);
+
+       return 0;
+}
+
+/*
+ * Decide whether the patch at @mc should replace revision @rev on @cpu.
+ *
+ * The running CPU's CPUID(1).EAX is translated through equiv_cpu_table,
+ * the result is compared with the patch's processor_rev_id bytes, the
+ * optional north/south bridge constraints are checked against the PCI
+ * bus, and finally the patch id must be greater than @rev.
+ *
+ * Returns 1 if the patch is applicable and newer, 0 otherwise.
+ * equiv_cpu_table must have been installed (BUG otherwise).
+ */
+static int get_matching_microcode(int cpu, void *mc, int rev)
+{
+       struct microcode_header_amd *hdr = mc;
+       struct equiv_cpu_entry *entry;
+       struct pci_dev *bridge;
+       unsigned int running_id;
+       unsigned int equiv_id = 0;
+       unsigned int equiv_lo, equiv_hi;
+       int mismatch;
+
+       BUG_ON(!equiv_cpu_table);
+       running_id = cpuid_eax(0x00000001);
+
+       /* the table ends with an entry whose installed_cpu is 0 */
+       for (entry = equiv_cpu_table; entry->installed_cpu; entry++) {
+               if (entry->installed_cpu != running_id)
+                       continue;
+               equiv_id = entry->equiv_cpu;
+               break;
+       }
+
+       if (equiv_id == 0) {
+               printk(KERN_ERR "microcode: CPU%d cpu_id "
+                      "not found in equivalent cpu table \n", cpu);
+               return 0;
+       }
+
+       equiv_lo = equiv_id & 0xff;
+       if (hdr->processor_rev_id[0] != equiv_lo) {
+               printk(KERN_ERR
+                       "microcode: CPU%d patch does not match "
+                       "(patch is %x, cpu extended is %x) \n",
+                       cpu, hdr->processor_rev_id[0], equiv_lo);
+               return 0;
+       }
+
+       equiv_hi = (equiv_id >> 16) & 0xff;
+       if (hdr->processor_rev_id[1] != equiv_hi) {
+               printk(KERN_ERR "microcode: CPU%d patch does not match "
+                       "(patch is %x, cpu base id is %x) \n",
+                       cpu, hdr->processor_rev_id[1], equiv_hi);
+               return 0;
+       }
+
+       /* ucode may be northbridge specific */
+       if (hdr->nb_dev_id) {
+               bridge = pci_get_device(PCI_VENDOR_ID_AMD,
+                                       hdr->nb_dev_id & 0xff, NULL);
+               mismatch = !bridge || hdr->nb_rev_id != bridge->revision;
+               if (mismatch)
+                       printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
+               /* drop the reference in both outcomes */
+               pci_dev_put(bridge);
+               if (mismatch)
+                       return 0;
+       }
+
+       /* ucode may be southbridge specific */
+       if (hdr->sb_dev_id) {
+               bridge = pci_get_device(PCI_VENDOR_ID_AMD,
+                                       hdr->sb_dev_id & 0xff, NULL);
+               mismatch = !bridge || hdr->sb_rev_id != bridge->revision;
+               if (mismatch)
+                       printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
+               pci_dev_put(bridge);
+               if (mismatch)
+                       return 0;
+       }
+
+       /* only a strictly newer patch is worth keeping */
+       return hdr->patch_id > rev;
+}
+
+/*
+ * Load the patch cached in ucode_cpu_info[] into the CPU we are running on.
+ *
+ * @cpu must be the current CPU (BUG otherwise).  Does nothing when no patch
+ * is cached.  The address of the patch header is written to MSR 0xc0010020
+ * and the patch level is read back from MSR 0x8B; the update counts as
+ * successful only if that level equals the patch's patch_id, in which case
+ * the cached revision is updated.
+ */
+static void apply_microcode_amd(int cpu)
+{
+       unsigned long flags;
+       unsigned int eax, edx;
+       unsigned int rev, dummy;
+       int cpu_num = raw_smp_processor_id();
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+       struct microcode_amd *mc_amd = uci->mc;
+       unsigned long addr;
+
+       /* We should bind the task to the CPU */
+       BUG_ON(cpu_num != cpu);
+
+       if (mc_amd == NULL)
+               return;
+
+       spin_lock_irqsave(&microcode_update_lock, flags);
+
+       addr = (unsigned long)&mc_amd->hdr.data_code;
+       edx = upper_32_bits(addr);
+       eax = lower_32_bits(addr);
+
+       /*
+        * Use the <asm/msr.h> helpers rather than open-coded asm: RDMSR
+        * writes EDX in addition to EAX, and the helpers account for every
+        * register the instructions touch.
+        */
+       wrmsr(0xc0010020, eax, edx);
+
+       /* get patch id after patching */
+       rdmsr(0x0000008B, rev, dummy);
+
+       spin_unlock_irqrestore(&microcode_update_lock, flags);
+
+       /* check current patch id and patch's id for match */
+       if (rev != mc_amd->hdr.patch_id) {
+               /* "from" is the revision we had, "to" the one we wanted */
+               printk(KERN_ERR "microcode: CPU%d update from revision "
+                      "0x%x to 0x%x failed\n", cpu_num,
+                      uci->cpu_sig.rev, mc_amd->hdr.patch_id);
+               return;
+       }
+
+       printk(KERN_INFO "microcode: CPU%d updated from revision "
+              "0x%x to 0x%x \n",
+              cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
+
+       uci->cpu_sig.rev = rev;
+}
+
+/*
+ * Extract the next patch section from the container image.
+ *
+ * @buf:            start of the section (8-byte section header + payload)
+ * @size:           number of bytes still available at @buf
+ * @get_ucode_data: copy routine; non-zero return means the copy failed
+ * @mc_size:        on success, bytes consumed (payload + section header)
+ *
+ * Returns a vmalloc()ed, zero-padded UCODE_MAX_SIZE buffer holding the
+ * payload (caller must vfree() it), or NULL if the section is malformed,
+ * does not fit in @size, or memory could not be obtained.
+ */
+static void * get_next_ucode(u8 *buf, unsigned int size,
+                       int (*get_ucode_data)(void *, const void *, size_t),
+                       unsigned int *mc_size)
+{
+       unsigned int total_size;
+#define UCODE_CONTAINER_SECTION_HDR    8
+       u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
+       void *mc;
+
+       /* never read a section header that is not completely present */
+       if (size < UCODE_CONTAINER_SECTION_HDR) {
+               printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+               return NULL;
+       }
+
+       if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
+               return NULL;
+
+       if (section_hdr[0] != UCODE_UCODE_TYPE) {
+               printk(KERN_ERR "microcode: error! "
+                      "Wrong microcode payload type field\n");
+               return NULL;
+       }
+
+       /* payload length: bytes 4 and 5 of the header, low byte first */
+       total_size = section_hdr[4] + (section_hdr[5] << 8);
+
+       printk(KERN_INFO "microcode: size %u, total_size %u\n",
+               size, total_size);
+
+       /*
+        * The payload starts after the section header, so header and
+        * payload together must fit in @size.  Otherwise the copy below
+        * would run past the image and the caller's remaining-bytes
+        * counter would wrap when it subtracts *mc_size.
+        */
+       if (total_size > size - UCODE_CONTAINER_SECTION_HDR ||
+           total_size > UCODE_MAX_SIZE) {
+               printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+               return NULL;
+       }
+
+       mc = vmalloc(UCODE_MAX_SIZE);
+       if (mc) {
+               memset(mc, 0, UCODE_MAX_SIZE);
+               if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) {
+                       vfree(mc);
+                       mc = NULL;
+               } else
+                       *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
+       }
+#undef UCODE_CONTAINER_SECTION_HDR
+       return mc;
+}
+
+
+/*
+ * Read the container header at @buf and load the equivalence table that
+ * follows it into the global equiv_cpu_table.
+ *
+ * The 12-byte header is three 32-bit words; word 1 must be
+ * UCODE_EQUIV_CPU_TABLE_TYPE and word 2 is the table size in bytes.
+ *
+ * Returns the number of bytes consumed (header + table), or 0 on any
+ * failure, in which case equiv_cpu_table is not left pointing at freed
+ * memory.  The table size is taken from the image and is not checked
+ * against the image length here, because the length is not passed in.
+ */
+static int install_equiv_cpu_table(u8 *buf,
+               int (*get_ucode_data)(void *, const void *, size_t))
+{
+#define UCODE_CONTAINER_HEADER_SIZE    12
+       /* three 32-bit words; a word array avoids casting a byte buffer */
+       unsigned int container_hdr[UCODE_CONTAINER_HEADER_SIZE /
+                                  sizeof(unsigned int)];
+       unsigned long size;
+
+       if (get_ucode_data(container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
+               return 0;
+
+       size = container_hdr[2];
+
+       if (container_hdr[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
+               printk(KERN_ERR "microcode: error! "
+                      "Wrong microcode equivalnet cpu table\n");
+               return 0;
+       }
+
+       equiv_cpu_table = vmalloc(size);
+       if (!equiv_cpu_table) {
+               printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n");
+               return 0;
+       }
+
+       buf += UCODE_CONTAINER_HEADER_SIZE;
+       if (get_ucode_data(equiv_cpu_table, buf, size)) {
+               vfree(equiv_cpu_table);
+               /* do not leave the global pointing at freed memory */
+               equiv_cpu_table = NULL;
+               return 0;
+       }
+
+       return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
+#undef UCODE_CONTAINER_HEADER_SIZE
+}
+
+/* Release the equivalence table, if one is installed, and forget it. */
+static void free_equiv_cpu_table(void)
+{
+       if (!equiv_cpu_table)
+               return;
+
+       vfree(equiv_cpu_table);
+       equiv_cpu_table = NULL;
+}
+
+/*
+ * Parse a microcode container image and cache the best patch for @cpu.
+ *
+ * @data/@size:     the image
+ * @get_ucode_data: copy routine used to read from the image
+ *
+ * The equivalence table is installed first; then each patch section is
+ * examined and the newest one accepted by get_matching_microcode() is
+ * kept.  Only if the whole image was consumed does that patch replace
+ * uci->mc.  The equivalence table is always released before returning
+ * from the parsing stage.
+ *
+ * Returns 0 when the whole image was parsed (whether or not a newer patch
+ * was found), the number of unparsed bytes (> 0) when parsing stopped
+ * early, or -EINVAL when the equivalence table could not be installed or
+ * claims to be larger than the image.
+ */
+static int generic_load_microcode(int cpu, void *data, size_t size,
+               int (*get_ucode_data)(void *, const void *, size_t))
+{
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+       u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+       int new_rev = uci->cpu_sig.rev;
+       unsigned int leftover;
+       unsigned long offset;
+
+       offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data);
+       if (!offset) {
+               printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
+               return -EINVAL;
+       }
+
+       /* "size - offset" below is unsigned; refuse to let it wrap */
+       if (offset > size) {
+               printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+               free_equiv_cpu_table();
+               return -EINVAL;
+       }
+
+       ucode_ptr += offset;
+       leftover = size - offset;
+
+       while (leftover) {
+               unsigned int uninitialized_var(mc_size);
+               struct microcode_header_amd *mc_header;
+
+               mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size);
+               if (!mc)
+                       break;
+
+               /*
+                * A section must not claim more bytes than remain,
+                * otherwise the subtraction at the end of the loop wraps
+                * and parsing continues beyond the image.
+                */
+               if (mc_size > leftover) {
+                       vfree(mc);
+                       break;
+               }
+
+               mc_header = (struct microcode_header_amd *)mc;
+               if (get_matching_microcode(cpu, mc, new_rev)) {
+                       /* newer candidate found: drop the previous one */
+                       if (new_mc)
+                               vfree(new_mc);
+                       new_rev = mc_header->patch_id;
+                       new_mc  = mc;
+               } else
+                       vfree(mc);
+
+               ucode_ptr += mc_size;
+               leftover  -= mc_size;
+       }
+
+       if (new_mc) {
+               if (!leftover) {
+                       if (uci->mc)
+                               vfree(uci->mc);
+                       uci->mc = new_mc;
+                       pr_debug("microcode: CPU%d found a matching microcode update with"
+                               " version 0x%x (current=0x%x)\n",
+                               cpu, new_rev, uci->cpu_sig.rev);
+               } else
+                       /* image only partly parsed: do not trust the result */
+                       vfree(new_mc);
+       }
+
+       free_equiv_cpu_table();
+
+       return (int)leftover;
+}
+
+/*
+ * Copy callback handed to generic_load_microcode() by
+ * request_microcode_fw(): a plain memcpy() of @n bytes from @from to @to.
+ * Always returns 0.
+ */
+static int get_ucode_fw(void *to, const void *from, size_t n)
+{
+       memcpy(to, from, n);
+       return 0;
+}
+
+/*
+ * Fetch "amd-ucode/microcode_amd.bin" through request_firmware() on behalf
+ * of @device and feed it to generic_load_microcode() for @cpu.
+ *
+ * Must run on @cpu (BUG otherwise).  Returns the request_firmware() error
+ * if the image could not be loaded, otherwise whatever
+ * generic_load_microcode() returns.
+ */
+static int request_microcode_fw(int cpu, struct device *device)
+{
+       const char *path = "amd-ucode/microcode_amd.bin";
+       const struct firmware *fw;
+       int err;
+
+       /* We should bind the task to the CPU */
+       BUG_ON(raw_smp_processor_id() != cpu);
+
+       err = request_firmware(&fw, path, device);
+       if (!err) {
+               err = generic_load_microcode(cpu, (void *)fw->data, fw->size,
+                                            get_ucode_fw);
+               release_firmware(fw);
+       } else {
+               printk(KERN_ERR "microcode: ucode data file %s load failed\n", path);
+       }
+
+       return err;
+}
+
+/*
+ * Stub for updates written to /dev/cpu/microcode: not implemented for AMD.
+ * Logs a warning and returns -1; @cpu, @buf and @size are ignored.
+ */
+static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+{
+       /* the two literals are concatenated, so the space must be explicit */
+       printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode"
+                       " is not supported\n");
+       return -1;
+}
+
+/* Drop the patch cached for @cpu, if any, and clear the slot. */
+static void microcode_fini_cpu_amd(int cpu)
+{
+       struct ucode_cpu_info *info = &ucode_cpu_info[cpu];
+
+       vfree(info->mc);
+       info->mc = NULL;
+}
+
+/* AMD implementations of the vendor hooks; handed out by init_amd_microcode(). */
+static struct microcode_ops microcode_amd_ops = {
+       .request_microcode_user           = request_microcode_user,
+       .request_microcode_fw             = request_microcode_fw,
+       .collect_cpu_info                 = collect_cpu_info_amd,
+       .apply_microcode                  = apply_microcode_amd,
+       .microcode_fini_cpu               = microcode_fini_cpu_amd,
+};
+
+/* Return the AMD vendor hook table; performs no other initialisation. */
+struct microcode_ops * __init init_amd_microcode(void)
+{
+       return &microcode_amd_ops;
+}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
new file mode 100644 (file)
index 0000000..936d8d5
--- /dev/null
@@ -0,0 +1,508 @@
+/*
+ *     Intel CPU Microcode Update Driver for Linux
+ *
+ *     Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *                   2006      Shaohua Li <shaohua.li@intel.com>
+ *
+ *     This driver allows to upgrade microcode on Intel processors
+ *     belonging to IA-32 family - PentiumPro, Pentium II,
+ *     Pentium III, Xeon, Pentium 4, etc.
+ *
+ *     Reference: Section 8.11 of Volume 3a, IA-32 Intel(R) Architecture
+ *     Software Developer's Manual
+ *     Order Number 253668 or free download from:
+ *
+ *     http://developer.intel.com/design/pentium4/manuals/253668.htm
+ *
+ *     For more information, go to http://www.urbanmyth.org/microcode
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License
+ *     as published by the Free Software Foundation; either version
+ *     2 of the License, or (at your option) any later version.
+ *
+ *     1.0     16 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *             Initial release.
+ *     1.01    18 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *             Added read() support + cleanups.
+ *     1.02    21 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *             Added 'device trimming' support. open(O_WRONLY) zeroes
+ *             and frees the saved copy of applied microcode.
+ *     1.03    29 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *             Made to use devfs (/dev/cpu/microcode) + cleanups.
+ *     1.04    06 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *             Added misc device support (now uses both devfs and misc).
+ *             Added MICROCODE_IOCFREE ioctl to clear memory.
+ *     1.05    09 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *             Messages for error cases (non Intel & no suitable microcode).
+ *     1.06    03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
+ *             Removed ->release(). Removed exclusive open and status bitmap.
+ *             Added microcode_rwsem to serialize read()/write()/ioctl().
+ *             Removed global kernel lock usage.
+ *     1.07    07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
+ *             Write 0 to 0x8B msr and then cpuid before reading revision,
+ *             so that it works even if there were no update done by the
+ *             BIOS. Otherwise, reading from 0x8B gives junk (which happened
+ *             to be 0 on my machine which is why it worked even when I
+ *             disabled update by the BIOS)
+ *             Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
+ *     1.08    11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
+ *                          Tigran Aivazian <tigran@veritas.com>
+ *             Intel Pentium 4 processor support and bugfixes.
+ *     1.09    30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
+ *             Bugfix for HT (Hyper-Threading) enabled processors
+ *             whereby processor resources are shared by all logical processors
+ *             in a single CPU package.
+ *     1.10    28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
+ *             Tigran Aivazian <tigran@veritas.com>,
+ *             Serialize updates as required on HT processors due to
+ *             speculative nature of implementation.
+ *     1.11    22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
+ *             Fix the panic when writing zero-length microcode chunk.
+ *     1.12    29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
+ *             Jun Nakajima <jun.nakajima@intel.com>
+ *             Support for the microcode updates in the new format.
+ *     1.13    10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
+ *             Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
+ *             because we no longer hold a copy of applied microcode
+ *             in kernel memory.
+ *     1.14    25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
+ *             Fix sigmatch() macro to handle old CPUs with pf == 0.
+ *             Thanks to Stuart Swales for pointing out this bug.
+ */
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/microcode.h>
+
+MODULE_DESCRIPTION("Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_LICENSE("GPL");
+
+#define MICROCODE_VERSION      "2.00"
+
+struct microcode_ops *microcode_ops;
+
+/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+static DEFINE_MUTEX(microcode_mutex);
+
+struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
+EXPORT_SYMBOL_GPL(ucode_cpu_info);
+
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+/*
+ * Offer the user-supplied image to every online CPU whose info is valid.
+ *
+ * The calling task is moved to each CPU in turn; the vendor hook parses
+ * the image and, when it returns 0, the update is applied on that CPU.
+ * A negative hook result stops the walk.  The task's original affinity is
+ * restored before returning.
+ *
+ * Returns the result of the last request_microcode_user() call, or 0 if
+ * none was made.
+ */
+static int do_microcode_update(const void __user *buf, size_t size)
+{
+       cpumask_t saved_mask = current->cpus_allowed;
+       int ret = 0;
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               if (!ucode_cpu_info[cpu].valid)
+                       continue;
+
+               set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+               ret = microcode_ops->request_microcode_user(cpu, buf, size);
+               if (ret < 0)
+                       break;
+               if (ret == 0)
+                       microcode_ops->apply_microcode(cpu);
+       }
+
+       set_cpus_allowed_ptr(current, &saved_mask);
+       return ret;
+}
+
+/* open() handler: allowed only for callers holding CAP_SYS_RAWIO. */
+static int microcode_open(struct inode *unused1, struct file *unused2)
+{
+       cycle_kernel_lock();
+
+       if (!capable(CAP_SYS_RAWIO))
+               return -EPERM;
+
+       return 0;
+}
+
+/*
+ * write() handler for /dev/cpu/microcode.
+ *
+ * Rejects images larger than physical memory with -EINVAL.  Otherwise the
+ * update runs with CPU hotplug held off and under microcode_mutex.
+ * Returns @len when do_microcode_update() returned 0, else its result.
+ */
+static ssize_t microcode_write(struct file *file, const char __user *buf,
+                              size_t len, loff_t *ppos)
+{
+       ssize_t written;
+       int err;
+
+       if ((len >> PAGE_SHIFT) > num_physpages) {
+               printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
+                      num_physpages);
+               return -EINVAL;
+       }
+
+       get_online_cpus();
+       mutex_lock(&microcode_mutex);
+
+       err = do_microcode_update(buf, len);
+       if (err)
+               written = err;
+       else
+               written = (ssize_t)len;
+
+       mutex_unlock(&microcode_mutex);
+       put_online_cpus();
+
+       return written;
+}
+
+/* The legacy character device is write-only: just open() and write(). */
+static const struct file_operations microcode_fops = {
+       .owner          = THIS_MODULE,
+       .write          = microcode_write,
+       .open           = microcode_open,
+};
+
+/* Misc device "microcode" on the fixed minor MICROCODE_MINOR. */
+static struct miscdevice microcode_dev = {
+       .minor          = MICROCODE_MINOR,
+       .name           = "microcode",
+       .fops           = &microcode_fops,
+};
+
+/*
+ * Register the legacy misc device.  Returns 0 on success or the
+ * misc_register() error, which is also logged.
+ */
+static int __init microcode_dev_init(void)
+{
+       int error = misc_register(&microcode_dev);
+
+       if (!error)
+               return 0;
+
+       printk(KERN_ERR
+               "microcode: can't misc_register on minor=%d\n",
+               MICROCODE_MINOR);
+       return error;
+}
+
+/* Counterpart of microcode_dev_init(): unregister the legacy misc device. */
+static void microcode_dev_exit(void)
+{
+       misc_deregister(&microcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while (0)
+#endif
+
+/* fake device for request_firmware */
+struct platform_device *microcode_pdev;
+
+/*
+ * sysfs "reload" store handler for one CPU.
+ *
+ * The input is parsed as a number; unparsable input yields -EINVAL and
+ * any value other than 1 is accepted but ignored.  For 1, and only if the
+ * CPU is online, the task is moved to that CPU and, when the CPU's info
+ * is valid, firmware is requested and applied on success.
+ *
+ * Returns @sz, or the non-zero result of request_microcode_fw().
+ */
+static ssize_t reload_store(struct sys_device *dev,
+                           struct sysdev_attribute *attr,
+                           const char *buf, size_t sz)
+{
+       int cpu = dev->id;
+       struct ucode_cpu_info *uci = &ucode_cpu_info[cpu];
+       cpumask_t saved_mask;
+       unsigned long val;
+       char *end;
+       int err = 0;
+
+       val = simple_strtoul(buf, &end, 0);
+       if (end == buf)
+               return -EINVAL;
+       if (val != 1)
+               return sz;
+
+       saved_mask = current->cpus_allowed;
+
+       get_online_cpus();
+       if (!cpu_online(cpu))
+               goto out;
+
+       set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+       mutex_lock(&microcode_mutex);
+       if (uci->valid) {
+               err = microcode_ops->request_microcode_fw(cpu,
+                               &microcode_pdev->dev);
+               if (!err)
+                       microcode_ops->apply_microcode(cpu);
+       }
+       mutex_unlock(&microcode_mutex);
+       set_cpus_allowed_ptr(current, &saved_mask);
+out:
+       put_online_cpus();
+
+       if (err)
+               return err;
+       return sz;
+}
+
+/* sysfs "version": the cached microcode revision of this CPU, in hex. */
+static ssize_t version_show(struct sys_device *dev,
+                       struct sysdev_attribute *attr, char *buf)
+{
+       struct cpu_signature *sig = &ucode_cpu_info[dev->id].cpu_sig;
+
+       return sprintf(buf, "0x%x\n", sig->rev);
+}
+
+/* sysfs "processor_flags": the cached cpu_sig.pf value of this CPU, in hex. */
+static ssize_t pf_show(struct sys_device *dev,
+                       struct sysdev_attribute *attr, char *buf)
+{
+       struct cpu_signature *sig = &ucode_cpu_info[dev->id].cpu_sig;
+
+       return sprintf(buf, "0x%x\n", sig->pf);
+}
+
+/* reload is write-only (0200); version and processor_flags are read-only (0400) */
+static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
+static SYSDEV_ATTR(version, 0400, version_show, NULL);
+static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+
+/* NULL-terminated list of the attributes above */
+static struct attribute *mc_default_attrs[] = {
+       &attr_reload.attr,
+       &attr_version.attr,
+       &attr_processor_flags.attr,
+       NULL
+};
+
+/* Created under each CPU's sysdev kobject as the "microcode" group */
+static struct attribute_group mc_attr_group = {
+       .attrs = mc_default_attrs,
+       .name = "microcode",
+};
+
+/*
+ * Let the vendor code drop whatever it cached for @cpu and mark the CPU's
+ * info invalid, all under microcode_mutex.
+ */
+static void microcode_fini_cpu(int cpu)
+{
+       mutex_lock(&microcode_mutex);
+
+       microcode_ops->microcode_fini_cpu(cpu);
+       ucode_cpu_info[cpu].valid = 0;
+
+       mutex_unlock(&microcode_mutex);
+}
+
+/*
+ * Reset the info slot of @cpu and ask the vendor code to fill in the
+ * signature; the slot is marked valid only if that hook returns 0.
+ */
+static void collect_cpu_info(int cpu)
+{
+       struct ucode_cpu_info *info = &ucode_cpu_info[cpu];
+
+       memset(info, 0, sizeof(*info));
+
+       if (microcode_ops->collect_cpu_info(cpu, &info->cpu_sig))
+               return;
+
+       info->valid = 1;
+}
+
+/*
+ * Check whether the microcode cached for @cpu may be re-applied.
+ *
+ * Returns 0 if a cached patch exists and the freshly collected signature
+ * equals the stored one, 1 if there is nothing cached or the signature
+ * changed, -1 if the signature could not be collected.  In the last two
+ * failure cases the cached data is discarded via microcode_fini_cpu().
+ */
+static int microcode_resume_cpu(int cpu)
+{
+       struct ucode_cpu_info *info = &ucode_cpu_info[cpu];
+       struct cpu_signature nsig;
+       int ret = 0;
+
+       pr_debug("microcode: CPU%d resumed\n", cpu);
+
+       if (!info->mc)
+               return 1;
+
+       /*
+        * Paranoia: make sure the cached ucode really belongs to this cpu.
+        * (Should we look for a new ucode when the signature differs?)
+        */
+       if (microcode_ops->collect_cpu_info(cpu, &nsig))
+               ret = -1;
+       else if (memcmp(&nsig, &info->cpu_sig, sizeof(nsig)))
+               ret = 1;
+
+       if (ret)
+               microcode_fini_cpu(cpu);
+
+       return ret;
+}
+
+/*
+ * Bring the microcode of @cpu up to date.
+ *
+ * A CPU whose info is already valid is treated as resuming: the cached
+ * patch is re-validated.  Otherwise the signature is collected and, if
+ * that worked and the system is fully running, firmware is requested.
+ * The vendor apply hook is called whenever the chosen step returned 0.
+ */
+void microcode_update_cpu(int cpu)
+{
+       struct ucode_cpu_info *info = &ucode_cpu_info[cpu];
+       int err = 0;
+
+       if (!info->valid) {
+               /* first contact with this CPU */
+               collect_cpu_info(cpu);
+               if (info->valid && system_state == SYSTEM_RUNNING)
+                       err = microcode_ops->request_microcode_fw(cpu,
+                                       &microcode_pdev->dev);
+       } else {
+               /* info->valid is still set: system resume in progress */
+               err = microcode_resume_cpu(cpu);
+       }
+
+       if (err)
+               return;
+
+       microcode_ops->apply_microcode(cpu);
+}
+
+/*
+ * Run microcode_update_cpu() for @cpu on that CPU: the calling task is
+ * temporarily bound to @cpu, the update runs under microcode_mutex, and
+ * the previous affinity is restored afterwards.
+ */
+static void microcode_init_cpu(int cpu)
+{
+       cpumask_t saved_mask;
+
+       saved_mask = current->cpus_allowed;
+       set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+
+       /* We should bind the task to the CPU */
+       BUG_ON(cpu != raw_smp_processor_id());
+
+       mutex_lock(&microcode_mutex);
+       microcode_update_cpu(cpu);
+       mutex_unlock(&microcode_mutex);
+
+       set_cpus_allowed_ptr(current, &saved_mask);
+}
+
+/*
+ * sysdev ->add hook.  For an online CPU: clear its info slot, create the
+ * "microcode" sysfs group and run the initial update.  Offline CPUs are
+ * skipped.  Returns 0, or the sysfs_create_group() error.
+ */
+static int mc_sysdev_add(struct sys_device *sys_dev)
+{
+       int cpu = sys_dev->id;
+       int err;
+
+       if (!cpu_online(cpu))
+               return 0;
+
+       pr_debug("microcode: CPU%d added\n", cpu);
+       memset(&ucode_cpu_info[cpu], 0, sizeof(ucode_cpu_info[cpu]));
+
+       err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+       if (!err)
+               microcode_init_cpu(cpu);
+
+       return err;
+}
+
+/*
+ * sysdev ->remove hook.  For an online CPU: drop its cached microcode and
+ * remove the "microcode" sysfs group.  Always returns 0.
+ */
+static int mc_sysdev_remove(struct sys_device *sys_dev)
+{
+       int cpu = sys_dev->id;
+
+       if (cpu_online(cpu)) {
+               pr_debug("microcode: CPU%d removed\n", cpu);
+               microcode_fini_cpu(cpu);
+               sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+       }
+
+       return 0;
+}
+
+/*
+ * sysdev ->resume hook.  When the device's CPU is online, the update is
+ * run for CPU 0 (not for dev->id).  Always returns 0.
+ */
+static int mc_sysdev_resume(struct sys_device *dev)
+{
+       /* only CPU 0 will apply ucode here */
+       if (cpu_online(dev->id))
+               microcode_update_cpu(0);
+
+       return 0;
+}
+
+/* Registered against cpu_sysdev_class in microcode_init(). */
+static struct sysdev_driver mc_sysdev_driver = {
+       .add = mc_sysdev_add,
+       .remove = mc_sysdev_remove,
+       .resume = mc_sysdev_resume,
+};
+
+/*
+ * CPU hotplug notifier.
+ *
+ *   CPU_ONLINE[_FROZEN]        run the update on the CPU, then create its
+ *                              sysfs group (falls through)
+ *   CPU_DOWN_FAILED[_FROZEN]   re-create the sysfs group
+ *   CPU_DOWN_PREPARE[_FROZEN]  remove the sysfs group only
+ *   CPU_DEAD,
+ *   CPU_UP_CANCELED_FROZEN     discard the CPU's cached microcode
+ *
+ * All other actions are ignored.  Always returns NOTIFY_OK.
+ */
+static __cpuinit int
+mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (unsigned long)hcpu;
+       struct sys_device *sys_dev;
+
+       sys_dev = get_cpu_sysdev(cpu);
+       switch (action) {
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+               microcode_init_cpu(cpu);
+               /* fall through: an onlined CPU also needs its sysfs group */
+       case CPU_DOWN_FAILED:
+       case CPU_DOWN_FAILED_FROZEN:
+               pr_debug("microcode: CPU%d added\n", cpu);
+               if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
+                       printk(KERN_ERR "microcode: Failed to create the sysfs "
+                               "group for CPU%d\n", cpu);
+               break;
+       case CPU_DOWN_PREPARE:
+       case CPU_DOWN_PREPARE_FROZEN:
+               /*
+                * Only the interface is removed here; the cached microcode
+                * is not touched by these two actions.
+                */
+               sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+               pr_debug("microcode: CPU%d removed\n", cpu);
+               break;
+       case CPU_DEAD:
+       case CPU_UP_CANCELED_FROZEN:
+               /*
+                * Either the CPU is dead, or it refused to come up during a
+                * system resume: drop what was cached for it.
+                */
+               microcode_fini_cpu(cpu);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+/* Hotplug notifier block; registered in microcode_init(), removed in microcode_exit(). */
+static struct notifier_block __refdata mc_cpu_notifier = {
+       .notifier_call = mc_cpu_callback,
+};
+
+/*
+ * Module init.
+ *
+ * Picks the vendor hook table from CPU 0's vendor id, then registers, in
+ * order: the legacy misc device, the fake platform device used for
+ * request_firmware(), the per-CPU sysdev driver and the hotplug notifier.
+ * On failure everything registered so far is undone.
+ *
+ * Returns 0 on success, -ENODEV for an unsupported vendor, or the error
+ * of the step that failed.
+ */
+static int __init microcode_init(void)
+{
+       struct cpuinfo_x86 *boot_cpu = &cpu_data(0);
+       int error;
+
+       switch (boot_cpu->x86_vendor) {
+       case X86_VENDOR_INTEL:
+               microcode_ops = init_intel_microcode();
+               break;
+       case X86_VENDOR_AMD:
+               microcode_ops = init_amd_microcode();
+               break;
+       default:
+               break;
+       }
+
+       if (microcode_ops == NULL) {
+               printk(KERN_ERR "microcode: no support for this CPU vendor\n");
+               return -ENODEV;
+       }
+
+       error = microcode_dev_init();
+       if (error)
+               return error;
+
+       microcode_pdev = platform_device_register_simple("microcode", -1,
+                                                        NULL, 0);
+       if (IS_ERR(microcode_pdev)) {
+               microcode_dev_exit();
+               return PTR_ERR(microcode_pdev);
+       }
+
+       get_online_cpus();
+       error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+       put_online_cpus();
+
+       if (!error) {
+               register_hotcpu_notifier(&mc_cpu_notifier);
+
+               printk(KERN_INFO
+                      "Microcode Update Driver: v" MICROCODE_VERSION
+                      " <tigran@aivazian.fsnet.co.uk>"
+                      " <peter.oruba@amd.com>\n");
+               return 0;
+       }
+
+       /* sysdev registration failed: undo the two earlier steps */
+       microcode_dev_exit();
+       platform_device_unregister(microcode_pdev);
+       return error;
+}
+
+/*
+ * Module exit: unregister everything microcode_init() registered (misc
+ * device, hotplug notifier, sysdev driver, platform device), clear the
+ * vendor hook pointer and log the removal.
+ */
+static void __exit microcode_exit(void)
+{
+       microcode_dev_exit();
+
+       unregister_hotcpu_notifier(&mc_cpu_notifier);
+
+       /* hotplug is held off while the sysdev driver is unregistered */
+       get_online_cpus();
+       sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+       put_online_cpus();
+
+       platform_device_unregister(microcode_pdev);
+
+       microcode_ops = NULL;
+
+       printk(KERN_INFO
+              "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
+}
+
+module_init(microcode_init);
+module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
new file mode 100644 (file)
index 0000000..622dc4a
--- /dev/null
@@ -0,0 +1,480 @@
+/*
+ *     Intel CPU Microcode Update Driver for Linux
+ *
+ *     Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *                   2006      Shaohua Li <shaohua.li@intel.com>
+ *
+ *     This driver allows upgrading the microcode on Intel processors
+ *     belonging to the IA-32 family - PentiumPro, Pentium II,
+ *     Pentium III, Xeon, Pentium 4, etc.
+ *
+ *     Reference: Section 8.11 of Volume 3a, IA-32 Intel(R) Architecture
+ *     Software Developer's Manual
+ *     Order Number 253668 or free download from:
+ *
+ *     http://developer.intel.com/design/pentium4/manuals/253668.htm
+ *
+ *     For more information, go to http://www.urbanmyth.org/microcode
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License
+ *     as published by the Free Software Foundation; either version
+ *     2 of the License, or (at your option) any later version.
+ *
+ *     1.0     16 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *             Initial release.
+ *     1.01    18 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *             Added read() support + cleanups.
+ *     1.02    21 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *             Added 'device trimming' support. open(O_WRONLY) zeroes
+ *             and frees the saved copy of applied microcode.
+ *     1.03    29 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *             Made to use devfs (/dev/cpu/microcode) + cleanups.
+ *     1.04    06 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *             Added misc device support (now uses both devfs and misc).
+ *             Added MICROCODE_IOCFREE ioctl to clear memory.
+ *     1.05    09 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *             Messages for error cases (non Intel & no suitable microcode).
+ *     1.06    03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
+ *             Removed ->release(). Removed exclusive open and status bitmap.
+ *             Added microcode_rwsem to serialize read()/write()/ioctl().
+ *             Removed global kernel lock usage.
+ *     1.07    07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
+ *             Write 0 to 0x8B msr and then cpuid before reading revision,
+ *             so that it works even if there were no update done by the
+ *             BIOS. Otherwise, reading from 0x8B gives junk (which happened
+ *             to be 0 on my machine which is why it worked even when I
+ *             disabled update by the BIOS)
+ *             Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
+ *     1.08    11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
+ *                          Tigran Aivazian <tigran@veritas.com>
+ *             Intel Pentium 4 processor support and bugfixes.
+ *     1.09    30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
+ *             Bugfix for HT (Hyper-Threading) enabled processors
+ *             whereby processor resources are shared by all logical processors
+ *             in a single CPU package.
+ *     1.10    28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
+ *             Tigran Aivazian <tigran@veritas.com>,
+ *             Serialize updates as required on HT processors due to
+ *             speculative nature of implementation.
+ *     1.11    22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
+ *             Fix the panic when writing zero-length microcode chunk.
+ *     1.12    29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
+ *             Jun Nakajima <jun.nakajima@intel.com>
+ *             Support for the microcode updates in the new format.
+ *     1.13    10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
+ *             Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
+ *             because we no longer hold a copy of applied microcode
+ *             in kernel memory.
+ *     1.14    25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
+ *             Fix sigmatch() macro to handle old CPUs with pf == 0.
+ *             Thanks to Stuart Swales for pointing out this bug.
+ */
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/microcode.h>
+
+MODULE_DESCRIPTION("Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_LICENSE("GPL");
+
+/*
+ * Header found at the start of every Intel microcode update image.
+ *
+ * As used in this file:
+ *  - hdrver, ldrver: microcode_sanity_check() rejects the image unless both
+ *    are 1.
+ *  - rev: revision of the update; compared against the CPU's current
+ *    revision and against the value read back after applying.
+ *  - date: printed by apply_microcode() as hex digits, year in bits 15..0,
+ *    month in bits 31..24, day in bits 23..16.
+ *  - sig, pf: signature and platform flags matched against the CPU.
+ *  - cksum: part of the header and therefore of the dword sum that must be 0.
+ *  - datasize, totalsize: 0 selects the defaults (see get_datasize() and
+ *    get_totalsize()).
+ */
+struct microcode_header_intel {
+       unsigned int            hdrver;
+       unsigned int            rev;
+       unsigned int            date;
+       unsigned int            sig;
+       unsigned int            cksum;
+       unsigned int            ldrver;
+       unsigned int            pf;
+       unsigned int            datasize;
+       unsigned int            totalsize;
+       unsigned int            reserved[3];
+};
+
+/*
+ * A complete update image: the header immediately followed by the update
+ * data.  apply_microcode() hands the address of bits[] to the CPU.
+ */
+struct microcode_intel {
+       struct microcode_header_intel hdr;
+       unsigned int            bits[0];
+};
+
+/*
+ * microcode format is extended from prescott processors
+ *
+ * One entry of the optional extended signature table: an additional
+ * (sig, pf) pair the update applies to, with its own checksum value.
+ */
+struct extended_signature {
+       unsigned int            sig;
+       unsigned int            pf;
+       unsigned int            cksum;
+};
+
+/*
+ * Optional table that follows the update data when totalsize is larger than
+ * header + data.  count gives the number of sigs[] entries; the dwords of the
+ * whole table must sum to 0 (checked in microcode_sanity_check()).
+ */
+struct extended_sigtable {
+       unsigned int            count;
+       unsigned int            cksum;
+       unsigned int            reserved[3];
+       struct extended_signature sigs[0];
+};
+
+/* Sizes used when the header leaves datasize/totalsize at 0. */
+#define DEFAULT_UCODE_DATASIZE         (2000)
+#define MC_HEADER_SIZE         (sizeof(struct microcode_header_intel))
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define EXT_HEADER_SIZE                (sizeof(struct extended_sigtable))
+#define EXT_SIGNATURE_SIZE     (sizeof(struct extended_signature))
+#define DWSIZE                 (sizeof(u32))
+
+/*
+ * Total image size / data size taken from the header at mc, falling back to
+ * the defaults when the field is 0.  The argument is parenthesized so that
+ * any pointer expression can be passed; note it is still evaluated twice.
+ */
+#define get_totalsize(mc) \
+       (((struct microcode_intel *)(mc))->hdr.totalsize ? \
+        ((struct microcode_intel *)(mc))->hdr.totalsize : \
+        DEFAULT_UCODE_TOTALSIZE)
+
+#define get_datasize(mc) \
+       (((struct microcode_intel *)(mc))->hdr.datasize ? \
+        ((struct microcode_intel *)(mc))->hdr.datasize : DEFAULT_UCODE_DATASIZE)
+
+/*
+ * Signatures must be equal; platform flags must share a bit, or both be 0
+ * (old CPUs with pf == 0, see changelog entry 1.14).
+ */
+#define sigmatch(s1, s2, p1, p2) \
+       (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
+
+/* Size in bytes of an extended signature table with (et)->count entries. */
+#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
+
+/* serialize access to the physical write to MSR 0x79 */
+static DEFINE_SPINLOCK(microcode_update_lock);
+
+/*
+ * Fill *csig with the signature, platform flags and current microcode
+ * revision for CPU cpu_num.
+ *
+ * Returns 0 on success, or -1 (with *csig zeroed) when the CPU is not an
+ * Intel processor of family 6 or later, or has X86_FEATURE_IA64 set.
+ *
+ * NOTE(review): cpuid/rdmsr/wrmsr below act on the executing CPU and nothing
+ * here checks that it is cpu_num -- presumably the caller pins the task.
+ */
+static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
+{
+       struct cpuinfo_x86 *c = &cpu_data(cpu_num);
+       unsigned int val[2];
+
+       memset(csig, 0, sizeof(*csig));
+
+       if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+           cpu_has(c, X86_FEATURE_IA64)) {
+               printk(KERN_ERR "microcode: CPU%d not a capable Intel "
+                       "processor\n", cpu_num);
+               return -1;
+       }
+
+       csig->sig = cpuid_eax(0x00000001);
+
+       /* pf stays 0 for family 6 models below 5 */
+       if ((c->x86_model >= 5) || (c->x86 > 6)) {
+               /* get processor flags from MSR 0x17 */
+               rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+               /* 3-bit field at bits 18..20 of the high word, as a bit mask */
+               csig->pf = 1 << ((val[1] >> 18) & 7);
+       }
+
+       wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+       /* see notes above for revision 1.07.  Apparent chip bug */
+       sync_core();
+       /* get the current revision from MSR 0x8B */
+       rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
+       pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
+                       csig->sig, csig->pf, csig->rev);
+
+       return 0;
+}
+
+/* Return 1 when (sig, pf) matches the CPU signature in csig, 0 otherwise. */
+static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
+{
+       if (sigmatch(sig, csig->sig, pf, csig->pf))
+               return 1;
+
+       return 0;
+}
+
+/* Return 1 when the update's revision is greater than rev, 0 otherwise. */
+static inline int
+update_match_revision(struct microcode_header_intel *mc_header, int rev)
+{
+       if (mc_header->rev > rev)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * Validate one microcode image in kernel memory at mc.
+ *
+ * Checks, in order: data size fits in total size, header/loader version are
+ * both 1, the optional extended signature table has a consistent size and a
+ * zero dword sum, the header + data dwords sum to zero, and finally each
+ * extended signature entry's checksum.
+ *
+ * Returns 0 when the image passes, -EINVAL on most failures and -EFAULT when
+ * the extended table size does not match its entry count.
+ */
+static int microcode_sanity_check(void *mc)
+{
+       struct microcode_header_intel *mc_header = mc;
+       struct extended_sigtable *ext_header = NULL;
+       struct extended_signature *ext_sig;
+       unsigned long total_size, data_size, ext_table_size;
+       int sum, orig_sum, ext_sigcount = 0, i;
+
+       total_size = get_totalsize(mc_header);
+       data_size = get_datasize(mc_header);
+       if (data_size + MC_HEADER_SIZE > total_size) {
+               printk(KERN_ERR "microcode: error! "
+                       "Bad data size in microcode data file\n");
+               return -EINVAL;
+       }
+
+       if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+               printk(KERN_ERR "microcode: error! "
+                       "Unknown microcode update format\n");
+               return -EINVAL;
+       }
+       /* whatever follows header + data is the extended signature table */
+       ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+       if (ext_table_size) {
+               /* must hold the table header plus a whole number of entries */
+               if ((ext_table_size < EXT_HEADER_SIZE)
+                || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+                       printk(KERN_ERR "microcode: error! "
+                               "Small exttable size in microcode data file\n");
+                       return -EINVAL;
+               }
+               ext_header = mc + MC_HEADER_SIZE + data_size;
+               if (ext_table_size != exttable_size(ext_header)) {
+                       printk(KERN_ERR "microcode: error! "
+                               "Bad exttable size in microcode data file\n");
+                       return -EFAULT;
+               }
+               ext_sigcount = ext_header->count;
+       }
+
+       /* check extended table checksum */
+       if (ext_table_size) {
+               int ext_table_sum = 0;
+               int *ext_tablep = (int *)ext_header;
+
+               i = ext_table_size / DWSIZE;
+               while (i--)
+                       ext_table_sum += ext_tablep[i];
+               if (ext_table_sum) {
+                       printk(KERN_WARNING "microcode: aborting, "
+                               "bad extended signature table checksum\n");
+                       return -EINVAL;
+               }
+       }
+
+       /* calculate the checksum */
+       orig_sum = 0;
+       i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+       while (i--)
+               orig_sum += ((int *)mc)[i];
+       if (orig_sum) {
+               printk(KERN_ERR "microcode: aborting, bad checksum\n");
+               return -EINVAL;
+       }
+       if (!ext_table_size)
+               return 0;
+       /* check extended signature checksum */
+       for (i = 0; i < ext_sigcount; i++) {
+               ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+                         EXT_SIGNATURE_SIZE * i;
+               /*
+                * orig_sum is known to be 0 here; swap the primary
+                * sig/pf/cksum for this entry's and require the sum to
+                * remain 0.
+                */
+               sum = orig_sum
+                       - (mc_header->sig + mc_header->pf + mc_header->cksum)
+                       + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+               if (sum) {
+                       printk(KERN_ERR "microcode: aborting, bad checksum\n");
+                       return -EINVAL;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Decide whether the image at mc is an update for the CPU described by
+ * cpu_sig that is newer than revision rev.  The primary signature in the
+ * header is tried first, then every entry of the extended signature table
+ * (if the image has one).
+ *
+ * return 0 - no update found
+ * return 1 - found update
+ */
+static int
+get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
+{
+       struct microcode_header_intel *hdr = mc;
+       unsigned long payload_end = get_datasize(hdr) + MC_HEADER_SIZE;
+       struct extended_sigtable *table;
+       struct extended_signature *entry;
+       int remaining;
+
+       if (!update_match_revision(hdr, rev))
+               return 0;
+
+       if (update_match_cpu(cpu_sig, hdr->sig, hdr->pf))
+               return 1;
+
+       /* No room after header + data means no extended table. */
+       if (get_totalsize(hdr) <= payload_end)
+               return 0;
+
+       table = mc + payload_end;
+       entry = (void *)table + EXT_HEADER_SIZE;
+
+       for (remaining = table->count; remaining > 0; remaining--, entry++) {
+               if (update_match_cpu(cpu_sig, entry->sig, entry->pf))
+                       return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Load the update cached in ucode_cpu_info[cpu].mc into the CPU.
+ *
+ * Must run on the CPU being updated (BUG otherwise).  Does nothing when no
+ * update is cached.  After the write, the revision is read back; on a
+ * mismatch with the update's revision an error is logged and the recorded
+ * revision is left untouched, otherwise cpu_sig.rev is updated.
+ */
+static void apply_microcode(int cpu)
+{
+       unsigned long flags;
+       unsigned int val[2];
+       int cpu_num = raw_smp_processor_id();
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+       struct microcode_intel *mc_intel = uci->mc;
+
+       /* We should bind the task to the CPU */
+       BUG_ON(cpu_num != cpu);
+
+       if (mc_intel == NULL)
+               return;
+
+       /* serialize access to the physical write to MSR 0x79 */
+       spin_lock_irqsave(&microcode_update_lock, flags);
+
+       /* write microcode via MSR 0x79 (low/high halves of the address) */
+       wrmsr(MSR_IA32_UCODE_WRITE,
+             (unsigned long) mc_intel->bits,
+             (unsigned long) mc_intel->bits >> 16 >> 16);
+       wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+       /* see notes above for revision 1.07.  Apparent chip bug */
+       sync_core();
+
+       /* get the current revision from MSR 0x8B */
+       rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+
+       spin_unlock_irqrestore(&microcode_update_lock, flags);
+       if (val[1] != mc_intel->hdr.rev) {
+               /*
+                * Report the revision we tried to load, not the one read
+                * back (which on failure is just the old revision again).
+                */
+               printk(KERN_ERR "microcode: CPU%d update from revision "
+                       "0x%x to 0x%x failed\n",
+                       cpu_num, uci->cpu_sig.rev, mc_intel->hdr.rev);
+               return;
+       }
+       printk(KERN_INFO "microcode: CPU%d updated from revision "
+              "0x%x to 0x%x, date = %04x-%02x-%02x \n",
+               cpu_num, uci->cpu_sig.rev, val[1],
+               mc_intel->hdr.date & 0xffff,
+               mc_intel->hdr.date >> 24,
+               (mc_intel->hdr.date >> 16) & 0xff);
+       uci->cpu_sig.rev = val[1];
+}
+
+/*
+ * Walk a buffer of concatenated microcode images and keep the newest one
+ * that matches CPU cpu.
+ *
+ * data/size describe the buffer; get_ucode_data() copies from it and returns
+ * non-zero on failure (memcpy wrapper for firmware, copy_from_user wrapper
+ * for user buffers).  Each image is copied to a vmalloc() buffer, sanity
+ * checked and compared against the best revision seen so far.
+ *
+ * Returns the number of bytes left unparsed: 0 when the whole buffer was
+ * consumed, non-zero when parsing stopped early.  uci->mc is replaced only
+ * when the whole buffer parsed cleanly and a matching update was found.
+ */
+static int generic_load_microcode(int cpu, void *data, size_t size,
+               int (*get_ucode_data)(void *, const void *, size_t))
+{
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+       u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+       int new_rev = uci->cpu_sig.rev;
+       unsigned int leftover = size;
+
+       while (leftover) {
+               struct microcode_header_intel mc_header;
+               unsigned int mc_size;
+
+               /* Never read a header that extends past the input buffer. */
+               if (leftover < sizeof(mc_header)) {
+                       printk(KERN_ERR "microcode: error! "
+                                       "Truncated header in microcode data file\n");
+                       break;
+               }
+
+               if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
+                       break;
+
+               /*
+                * An image smaller than its own header would make the
+                * sanity check read beyond the vmalloc() buffer.
+                */
+               mc_size = get_totalsize(&mc_header);
+               if (mc_size < sizeof(mc_header) || mc_size > leftover) {
+                       printk(KERN_ERR "microcode: error! "
+                                       "Bad data in microcode data file\n");
+                       break;
+               }
+
+               mc = vmalloc(mc_size);
+               if (!mc)
+                       break;
+
+               if (get_ucode_data(mc, ucode_ptr, mc_size) ||
+                   microcode_sanity_check(mc) < 0) {
+                       vfree(mc);
+                       break;
+               }
+
+               if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
+                       /* newer match: drop the previous candidate */
+                       vfree(new_mc);
+                       new_rev = mc_header.rev;
+                       new_mc  = mc;
+               } else
+                       vfree(mc);
+
+               ucode_ptr += mc_size;
+               leftover  -= mc_size;
+       }
+
+       if (new_mc) {
+               if (!leftover) {
+                       vfree(uci->mc);
+                       uci->mc = (struct microcode_intel *)new_mc;
+                       pr_debug("microcode: CPU%d found a matching microcode update with"
+                                " version 0x%x (current=0x%x)\n",
+                               cpu, new_rev, uci->cpu_sig.rev);
+               } else
+                       vfree(new_mc);  /* parse error: discard candidate */
+       }
+
+       return (int)leftover;
+}
+
+/*
+ * get_ucode_data() callback for firmware blobs: plain memcpy() between
+ * kernel buffers.  Always returns 0.
+ */
+static int get_ucode_fw(void *to, const void *from, size_t n)
+{
+       memcpy(to, from, n);
+       return 0;
+}
+
+/*
+ * Fetch "intel-ucode/<family>-<model>-<stepping>" (two hex digits each)
+ * through request_firmware() and parse it for CPU cpu.
+ *
+ * Must run on CPU cpu (BUG otherwise).  Returns the request_firmware() error
+ * if the file cannot be loaded, otherwise the result of
+ * generic_load_microcode().
+ */
+static int request_microcode_fw(int cpu, struct device *device)
+{
+       char name[30];
+       struct cpuinfo_x86 *c = &cpu_data(cpu);
+       const struct firmware *firmware;
+       int ret;
+
+       /* We should bind the task to the CPU */
+       BUG_ON(cpu != raw_smp_processor_id());
+       /* bounded write: name[] can never be overrun */
+       snprintf(name, sizeof(name), "intel-ucode/%02x-%02x-%02x",
+               c->x86, c->x86_model, c->x86_mask);
+       ret = request_firmware(&firmware, name, device);
+       if (ret) {
+               pr_debug("microcode: data file %s load failed\n", name);
+               return ret;
+       }
+
+       ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
+                       &get_ucode_fw);
+
+       release_firmware(firmware);
+
+       return ret;
+}
+
+/*
+ * get_ucode_data() callback for user-space buffers: returns whatever
+ * copy_from_user() returns, i.e. non-zero when the copy was incomplete.
+ */
+static int get_ucode_user(void *to, const void *from, size_t n)
+{
+       return copy_from_user(to, from, n);
+}
+
+/*
+ * Parse a microcode buffer supplied from user space for CPU cpu.  Must run
+ * on CPU cpu (BUG otherwise).  Returns the result of
+ * generic_load_microcode().
+ */
+static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+{
+       /* We should bind the task to the CPU */
+       BUG_ON(cpu != raw_smp_processor_id());
+
+       return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user);
+}
+
+/* Release the update image cached for CPU cpu, if any, and forget it. */
+static void microcode_fini_cpu(int cpu)
+{
+       vfree(ucode_cpu_info[cpu].mc);
+       ucode_cpu_info[cpu].mc = NULL;
+}
+
+/* Intel implementation of the vendor ops, handed out by init_intel_microcode(). */
+struct microcode_ops microcode_intel_ops = {
+       .request_microcode_user           = request_microcode_user,
+       .request_microcode_fw             = request_microcode_fw,
+       .collect_cpu_info                 = collect_cpu_info,
+       .apply_microcode                  = apply_microcode,
+       .microcode_fini_cpu               = microcode_fini_cpu,
+};
+
+/*
+ * Return the Intel ops table.  microcode_init() calls this when the boot
+ * CPU's vendor is X86_VENDOR_INTEL.
+ */
+struct microcode_ops * __init init_intel_microcode(void)
+{
+       return &microcode_intel_ops;
+}
+
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644 (file)
index 0000000..0e9f198
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * Split spinlock implementation out into its own file, so it can be
+ * compiled in a FTRACE-compatible way.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <asm/paravirt.h>
+
+/*
+ * Default spin_lock_flags operation: the flags argument is ignored and the
+ * lock is simply taken with __raw_spin_lock().
+ */
+static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
+{
+       __raw_spin_lock(lock);
+}
+
+/*
+ * Default lock operations.  With CONFIG_SMP these are the __ticket_spin_*
+ * implementations plus default_spin_lock_flags(); without CONFIG_SMP the
+ * structure is left zero-initialized.
+ */
+struct pv_lock_ops pv_lock_ops = {
+#ifdef CONFIG_SMP
+       .spin_is_locked = __ticket_spin_is_locked,
+       .spin_is_contended = __ticket_spin_is_contended,
+
+       .spin_lock = __ticket_spin_lock,
+       .spin_lock_flags = default_spin_lock_flags,
+       .spin_trylock = __ticket_spin_trylock,
+       .spin_unlock = __ticket_spin_unlock,
+#endif
+};
+EXPORT_SYMBOL(pv_lock_ops);
+
+/*
+ * Switch pv_lock_ops over to the __byte_spin_* implementations (SMP only;
+ * a no-op otherwise).  spin_lock_flags is not reassigned here.
+ */
+void __init paravirt_use_bytelocks(void)
+{
+#ifdef CONFIG_SMP
+       pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
+       pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
+       pv_lock_ops.spin_lock = __byte_spin_lock;
+       pv_lock_ops.spin_trylock = __byte_spin_trylock;
+       pv_lock_ops.spin_unlock = __byte_spin_unlock;
+#endif
+}
index 6b0bb73..e4c8fb6 100644 (file)
@@ -268,17 +268,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
        return __get_cpu_var(paravirt_lazy_mode);
 }
 
-void __init paravirt_use_bytelocks(void)
-{
-#ifdef CONFIG_SMP
-       pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
-       pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
-       pv_lock_ops.spin_lock = __byte_spin_lock;
-       pv_lock_ops.spin_trylock = __byte_spin_trylock;
-       pv_lock_ops.spin_unlock = __byte_spin_unlock;
-#endif
-}
-
 struct pv_info pv_info = {
        .name = "bare hardware",
        .paravirt_enabled = 0,
@@ -349,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = {
        .write_ldt_entry = native_write_ldt_entry,
        .write_gdt_entry = native_write_gdt_entry,
        .write_idt_entry = native_write_idt_entry,
+
+       .alloc_ldt = paravirt_nop,
+       .free_ldt = paravirt_nop,
+
        .load_sp0 = native_load_sp0,
 
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -460,18 +453,6 @@ struct pv_mmu_ops pv_mmu_ops = {
        .set_fixmap = native_set_fixmap,
 };
 
-struct pv_lock_ops pv_lock_ops = {
-#ifdef CONFIG_SMP
-       .spin_is_locked = __ticket_spin_is_locked,
-       .spin_is_contended = __ticket_spin_is_contended,
-
-       .spin_lock = __ticket_spin_lock,
-       .spin_trylock = __ticket_spin_trylock,
-       .spin_unlock = __ticket_spin_unlock,
-#endif
-};
-EXPORT_SYMBOL(pv_lock_ops);
-
 EXPORT_SYMBOL_GPL(pv_time_ops);
 EXPORT_SYMBOL    (pv_cpu_ops);
 EXPORT_SYMBOL    (pv_mmu_ops);
index 205188d..922c140 100644 (file)
@@ -76,47 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
        return ((unsigned long *)tsk->thread.sp)[3];
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-#include <asm/nmi.h>
-
-static void cpu_exit_clear(void)
-{
-       int cpu = raw_smp_processor_id();
-
-       idle_task_exit();
-
-       cpu_uninit();
-       irq_ctx_exit(cpu);
-
-       cpu_clear(cpu, cpu_callout_map);
-       cpu_clear(cpu, cpu_callin_map);
-
-       numa_remove_cpu(cpu);
-       c1e_remove_cpu(cpu);
-}
-
-/* We don't actually take CPU down, just spin without interrupts. */
-static inline void play_dead(void)
-{
-       /* This must be done before dead CPU ack */
-       cpu_exit_clear();
-       mb();
-       /* Ack it */
-       __get_cpu_var(cpu_state) = CPU_DEAD;
-
-       /*
-        * With physical CPU hotplug, we should halt the cpu
-        */
-       local_irq_disable();
-       /* mask all interrupts, flush any and all caches, and halt */
-       wbinvd_halt();
-}
-#else
+#ifndef CONFIG_SMP
 static inline void play_dead(void)
 {
        BUG();
 }
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif
 
 /*
  * The idle thread. There's no useful work to be
index 2a8ccb9..ca80394 100644 (file)
@@ -86,30 +86,12 @@ void exit_idle(void)
        __exit_idle();
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-DECLARE_PER_CPU(int, cpu_state);
-
-#include <linux/nmi.h>
-/* We halt the CPU with physical CPU hotplug */
-static inline void play_dead(void)
-{
-       idle_task_exit();
-       c1e_remove_cpu(raw_smp_processor_id());
-
-       mb();
-       /* Ack it */
-       __get_cpu_var(cpu_state) = CPU_DEAD;
-
-       local_irq_disable();
-       /* mask all interrupts, flush any and all caches, and halt */
-       wbinvd_halt();
-}
-#else
+#ifndef CONFIG_SMP
 static inline void play_dead(void)
 {
        BUG();
 }
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif
 
 /*
  * The idle thread. There's no useful work to be
@@ -754,12 +736,12 @@ unsigned long get_wchan(struct task_struct *p)
        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
-       if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
+       if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
-                   fp > (unsigned long)stack+THREAD_SIZE)
+                   fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
index 4e1ef66..0a6d8c1 100644 (file)
@@ -1489,7 +1489,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
 #endif
 }
 
-void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
+                                        int error_code, int si_code)
 {
        struct siginfo info;
 
@@ -1498,7 +1499,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
 
        memset(&info, 0, sizeof(info));
        info.si_signo = SIGTRAP;
-       info.si_code = TRAP_BRKPT;
+       info.si_code = si_code;
 
        /* User-mode ip? */
        info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
@@ -1585,5 +1586,5 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
         */
        if (test_thread_flag(TIF_SINGLESTEP) &&
            tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
-               send_sigtrap(current, regs, 0);
+               send_sigtrap(current, regs, 0, TRAP_BRKPT);
 }
index 46c98ef..21b8e0a 100644 (file)
@@ -582,6 +582,190 @@ static struct x86_quirks default_x86_quirks __initdata;
 struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
 
 /*
+ * Some BIOSes seem to corrupt the low 64k of memory during events
+ * like suspend/resume and unplugging an HDMI cable.  Reserve all
+ * remaining free memory in that area and fill it with a distinct
+ * pattern.
+ */
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+/* upper bound on the number of low-memory regions that are tracked */
+#define MAX_SCAN_AREAS 8
+
+/* -1 = not set on the command line; resolved in setup_bios_corruption_check() */
+static int __read_mostly memory_corruption_check = -1;
+
+/* bytes of low memory to cover, and interval between periodic scans */
+static unsigned __read_mostly corruption_check_size = 64*1024;
+static unsigned __read_mostly corruption_check_period = 60; /* seconds */
+
+/* regions reserved for scanning; only addr and size are filled in */
+static struct e820entry scan_areas[MAX_SCAN_AREAS];
+static int num_scan_areas;
+
+
+/*
+ * "memory_corruption_check=" early parameter: store the decimal value.
+ * Returns -EINVAL when the argument has trailing characters (the parsed
+ * value is stored regardless).
+ */
+static int set_corruption_check(char *arg)
+{
+       char *rest;
+
+       memory_corruption_check = simple_strtol(arg, &rest, 10);
+       if (*rest != '\0')
+               return -EINVAL;
+
+       return 0;
+}
+early_param("memory_corruption_check", set_corruption_check);
+
+/*
+ * "memory_corruption_check_period=" early parameter: store the decimal
+ * number of seconds.  Returns -EINVAL when the argument has trailing
+ * characters (the parsed value is stored regardless).
+ */
+static int set_corruption_check_period(char *arg)
+{
+       char *rest;
+
+       corruption_check_period = simple_strtoul(arg, &rest, 10);
+       if (*rest != '\0')
+               return -EINVAL;
+
+       return 0;
+}
+early_param("memory_corruption_check_period", set_corruption_check_period);
+
+/*
+ * "memory_corruption_check_size=" early parameter: parse a size with
+ * memparse() and store it.  The value is stored only when the whole
+ * argument was consumed; otherwise -EINVAL is returned and the current
+ * setting is kept.
+ */
+static int set_corruption_check_size(char *arg)
+{
+       char *end;
+       unsigned size;
+
+       size = memparse(arg, &end);
+
+       /*
+        * Decide on the parse result itself rather than on whether the
+        * value happens to equal the current setting.
+        */
+       if (*end != '\0')
+               return -EINVAL;
+
+       corruption_check_size = size;
+
+       return 0;
+}
+early_param("memory_corruption_check_size", set_corruption_check_size);
+
+
+/*
+ * Reserve the free RAM below corruption_check_size (skipping the first
+ * page), zero it and record it in scan_areas[] so that it can be scanned
+ * for corruption later.
+ *
+ * The check is enabled by the command line, or by
+ * CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK when the command line left it
+ * unset; a check size of 0 disables it.  At most MAX_SCAN_AREAS regions are
+ * recorded.
+ */
+static void __init setup_bios_corruption_check(void)
+{
+       u64 addr = PAGE_SIZE;   /* assume first page is reserved anyway */
+
+       if (memory_corruption_check == -1) {
+               memory_corruption_check =
+#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
+                       1
+#else
+                       0
+#endif
+                       ;
+       }
+
+       if (corruption_check_size == 0)
+               memory_corruption_check = 0;
+
+       if (!memory_corruption_check)
+               return;
+
+       corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
+
+       while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
+               u64 size;
+               addr = find_e820_area_size(addr, &size, PAGE_SIZE);
+
+               if (addr == 0)
+                       break;
+
+               /*
+                * The next free area may start at or beyond the end of the
+                * window; without this check the clamp below would underflow
+                * and reserve (and clear) a huge range.
+                */
+               if (addr >= corruption_check_size)
+                       break;
+
+               if ((addr + size) > corruption_check_size)
+                       size = corruption_check_size - addr;
+
+               if (size == 0)
+                       break;
+
+               e820_update_range(addr, size, E820_RAM, E820_RESERVED);
+               scan_areas[num_scan_areas].addr = addr;
+               scan_areas[num_scan_areas].size = size;
+               num_scan_areas++;
+
+               /* Assume we've already mapped this early memory */
+               memset(__va(addr), 0, size);
+
+               addr += size;
+       }
+
+       printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
+              num_scan_areas);
+       update_e820();
+}
+
+static struct timer_list periodic_check_timer;
+
+/*
+ * Scan every recorded area one unsigned long at a time.  Each non-zero word
+ * is logged and reset to zero; a single WARN is raised at the end if
+ * anything was found.  Does nothing when memory_corruption_check is 0.
+ */
+void check_for_bios_corruption(void)
+{
+       int i;
+       int corruption = 0;
+
+       if (!memory_corruption_check)
+               return;
+
+       for(i = 0; i < num_scan_areas; i++) {
+               unsigned long *addr = __va(scan_areas[i].addr);
+               unsigned long size = scan_areas[i].size;
+
+               /* NOTE(review): assumes size is a multiple of sizeof(long) */
+               for(; size; addr++, size -= sizeof(unsigned long)) {
+                       if (!*addr)
+                               continue;
+                       printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
+                              addr, __pa(addr), *addr);
+                       corruption = 1;
+                       *addr = 0;
+               }
+       }
+
+       WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
+}
+
+/*
+ * Timer callback (data is unused): run one scan, then re-arm the timer
+ * corruption_check_period seconds ahead, rounded with round_jiffies().
+ */
+static void periodic_check_for_corruption(unsigned long data)
+{
+       check_for_bios_corruption();
+       mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
+}
+
+/*
+ * Start the periodic low-memory scan.  Does nothing when the check is
+ * disabled or the period is 0.  The first scan runs immediately; the timer
+ * callback then re-arms itself.
+ */
+void start_periodic_check_for_corruption(void)
+{
+       if (!memory_corruption_check || corruption_check_period == 0)
+               return;
+
+       /* corruption_check_period is unsigned, hence %u */
+       printk(KERN_INFO "Scanning for low memory corruption every %u seconds\n",
+              corruption_check_period);
+
+       init_timer(&periodic_check_timer);
+       periodic_check_timer.function = &periodic_check_for_corruption;
+       periodic_check_for_corruption(0);
+}
+#endif
+
+/*
+ * DMI callback for entries of bad_bios_dmi_table: log the matched system,
+ * mark the first 64k (0..0x10000) of RAM as reserved in the e820 map and
+ * re-sanitize the map.  Always returns 0.
+ */
+static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
+{
+       printk(KERN_NOTICE
+               "%s detected: BIOS may corrupt low RAM, working it around.\n",
+               d->ident);
+
+       e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
+       sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
+       return 0;
+}
+
+/*
+ * List of systems that have known low memory corruption BIOS problems.
+ * Entries match on the BIOS vendor string and are only compiled in with
+ * CONFIG_X86_RESERVE_LOW_64K; otherwise the table holds just the terminator.
+ */
+static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
+#ifdef CONFIG_X86_RESERVE_LOW_64K
+       {
+               .callback = dmi_low_memory_corruption,
+               .ident = "AMI BIOS",
+               .matches = {
+                       DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+               },
+       },
+       {
+               .callback = dmi_low_memory_corruption,
+               .ident = "Phoenix BIOS",
+               .matches = {
+                       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
+               },
+       },
+#endif
+       {}
+};
+
+/*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
  * for initialization.  Note, the efi init code path is determined by the
@@ -715,6 +899,10 @@ void __init setup_arch(char **cmdline_p)
 
        finish_e820_parsing();
 
+       dmi_scan_machine();
+
+       dmi_check_system(bad_bios_dmi_table);
+
 #ifdef CONFIG_X86_32
        probe_roms();
 #endif
@@ -771,6 +959,10 @@ void __init setup_arch(char **cmdline_p)
        high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
 #endif
 
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+       setup_bios_corruption_check();
+#endif
+
        /* max_pfn_mapped is updated here */
        max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
        max_pfn_mapped = max_low_pfn_mapped;
@@ -799,8 +991,6 @@ void __init setup_arch(char **cmdline_p)
        vsmp_init();
 #endif
 
-       dmi_scan_machine();
-
        io_delay_init();
 
        /*
@@ -903,3 +1093,5 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif
 }
+
+
index b21070e..d6dd057 100644 (file)
@@ -27,6 +27,7 @@
 #include <asm/uaccess.h>
 #include <asm/i387.h>
 #include <asm/vdso.h>
+#include <asm/syscall.h>
 #include <asm/syscalls.h>
 
 #include "sigframe.h"
@@ -112,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
        return do_sigaltstack(uss, uoss, regs->sp);
 }
 
+/*
+ * Helpers for restore_sigcontext().  Each one reads a field of the user
+ * sigcontext 'sc' with __get_user() and ORs the result into 'err'.
+ *
+ * They expand to a single do { } while (0) statement, so "COPY(x);" is
+ * one statement in every context (including an unbraced if/else) and
+ * leaves no stray empty statement behind.
+ */
+
+/* Copy sc->x straight into regs->x. */
+#define COPY(x)                        do {            \
+       err |= __get_user(regs->x, &sc->x);     \
+} while (0)
+
+/* Copy a 16-bit segment selector from sc->seg into regs->seg. */
+#define COPY_SEG(seg)          do {                    \
+               unsigned short tmp;                     \
+               err |= __get_user(tmp, &sc->seg);       \
+               regs->seg = tmp;                        \
+} while (0)
+
+/* As COPY_SEG, but OR 3 into the selector before storing it. */
+#define COPY_SEG_STRICT(seg)   do {                    \
+               unsigned short tmp;                     \
+               err |= __get_user(tmp, &sc->seg);       \
+               regs->seg = tmp | 3;                    \
+} while (0)
+
+/* Read the selector from sc->seg and load it with loadsegment(). */
+#define GET_SEG(seg)           do {                    \
+               unsigned short tmp;                     \
+               err |= __get_user(tmp, &sc->seg);       \
+               loadsegment(seg, tmp);                  \
+} while (0)
 
 /*
  * Do a signal return; undo the signal stack.
@@ -120,28 +142,13 @@ static int
 restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
                   unsigned long *pax)
 {
+       void __user *buf;
+       unsigned int tmpflags;
        unsigned int err = 0;
 
        /* Always make any pending restarted system calls return -EINTR */
        current_thread_info()->restart_block.fn = do_no_restart_syscall;
 
-#define COPY(x)                err |= __get_user(regs->x, &sc->x)
-
-#define COPY_SEG(seg)                                                  \
-       { unsigned short tmp;                                           \
-         err |= __get_user(tmp, &sc->seg);                             \
-         regs->seg = tmp; }
-
-#define COPY_SEG_STRICT(seg)                                           \
-       { unsigned short tmp;                                           \
-         err |= __get_user(tmp, &sc->seg);                             \
-         regs->seg = tmp|3; }
-
-#define GET_SEG(seg)                                                   \
-       { unsigned short tmp;                                           \
-         err |= __get_user(tmp, &sc->seg);                             \
-         loadsegment(seg, tmp); }
-
        GET_SEG(gs);
        COPY_SEG(fs);
        COPY_SEG(es);
@@ -151,21 +158,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
        COPY_SEG_STRICT(cs);
        COPY_SEG_STRICT(ss);
 
-       {
-               unsigned int tmpflags;
-
-               err |= __get_user(tmpflags, &sc->flags);
-               regs->flags = (regs->flags & ~FIX_EFLAGS) |
-                                               (tmpflags & FIX_EFLAGS);
-               regs->orig_ax = -1;             /* disable syscall checks */
-       }
-
-       {
-               void __user *buf;
+       err |= __get_user(tmpflags, &sc->flags);
+       regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+       regs->orig_ax = -1;             /* disable syscall checks */
 
-               err |= __get_user(buf, &sc->fpstate);
-               err |= restore_i387_xstate(buf);
-       }
+       err |= __get_user(buf, &sc->fpstate);
+       err |= restore_i387_xstate(buf);
 
        err |= __get_user(*pax, &sc->ax);
        return err;
@@ -214,9 +212,8 @@ badframe:
        return 0;
 }
 
-asmlinkage int sys_rt_sigreturn(unsigned long __unused)
+static long do_rt_sigreturn(struct pt_regs *regs)
 {
-       struct pt_regs *regs = (struct pt_regs *)&__unused;
        struct rt_sigframe __user *frame;
        unsigned long ax;
        sigset_t set;
@@ -242,10 +239,17 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
        return ax;
 
 badframe:
-       force_sig(SIGSEGV, current);
+       signal_fault(regs, frame, "rt_sigreturn");
        return 0;
 }
 
+/*
+ * rt_sigreturn system call entry.  The address of the first argument is
+ * reinterpreted as the saved pt_regs and handed to do_rt_sigreturn(),
+ * whose result is returned.
+ */
+asmlinkage int sys_rt_sigreturn(unsigned long __unused)
+{
+       return do_rt_sigreturn((struct pt_regs *)&__unused);
+}
+
 /*
  * Set up a signal frame.
  */
@@ -337,39 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 }
 
 static int
-setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
-           struct pt_regs *regs)
+__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
+             struct pt_regs *regs)
 {
        struct sigframe __user *frame;
        void __user *restorer;
        int err = 0;
-       int usig;
        void __user *fpstate = NULL;
 
        frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
 
        if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-               goto give_sigsegv;
+               return -EFAULT;
 
-       usig = current_thread_info()->exec_domain
-               && current_thread_info()->exec_domain->signal_invmap
-               && sig < 32
-               ? current_thread_info()->exec_domain->signal_invmap[sig]
-               : sig;
+       if (__put_user(sig, &frame->sig))
+               return -EFAULT;
 
-       err = __put_user(usig, &frame->sig);
-       if (err)
-               goto give_sigsegv;
-
-       err = setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]);
-       if (err)
-               goto give_sigsegv;
+       if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
+               return -EFAULT;
 
        if (_NSIG_WORDS > 1) {
-               err = __copy_to_user(&frame->extramask, &set->sig[1],
-                                     sizeof(frame->extramask));
-               if (err)
-                       goto give_sigsegv;
+               if (__copy_to_user(&frame->extramask, &set->sig[1],
+                                  sizeof(frame->extramask)))
+                       return -EFAULT;
        }
 
        if (current->mm->context.vdso)
@@ -394,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
        err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
 
        if (err)
-               goto give_sigsegv;
+               return -EFAULT;
 
        /* Set up registers for signal handler */
        regs->sp = (unsigned long)frame;
@@ -409,38 +403,27 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
        regs->cs = __USER_CS;
 
        return 0;
-
-give_sigsegv:
-       force_sigsegv(sig, current);
-       return -EFAULT;
 }
 
-static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-                         sigset_t *set, struct pt_regs *regs)
+static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+                           sigset_t *set, struct pt_regs *regs)
 {
        struct rt_sigframe __user *frame;
        void __user *restorer;
        int err = 0;
-       int usig;
        void __user *fpstate = NULL;
 
        frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
 
        if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-               goto give_sigsegv;
-
-       usig = current_thread_info()->exec_domain
-               && current_thread_info()->exec_domain->signal_invmap
-               && sig < 32
-               ? current_thread_info()->exec_domain->signal_invmap[sig]
-               : sig;
+               return -EFAULT;
 
-       err |= __put_user(usig, &frame->sig);
+       err |= __put_user(sig, &frame->sig);
        err |= __put_user(&frame->info, &frame->pinfo);
        err |= __put_user(&frame->uc, &frame->puc);
        err |= copy_siginfo_to_user(&frame->info, info);
        if (err)
-               goto give_sigsegv;
+               return -EFAULT;
 
        /* Create the ucontext.  */
        if (cpu_has_xsave)
@@ -456,7 +439,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
                                regs, set->sig[0]);
        err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
        if (err)
-               goto give_sigsegv;
+               return -EFAULT;
 
        /* Set up to return from userspace.  */
        restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -476,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
        err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
 
        if (err)
-               goto give_sigsegv;
+               return -EFAULT;
 
        /* Set up registers for signal handler */
        regs->sp = (unsigned long)frame;
        regs->ip = (unsigned long)ka->sa.sa_handler;
-       regs->ax = (unsigned long)usig;
+       regs->ax = (unsigned long)sig;
        regs->dx = (unsigned long)&frame->info;
        regs->cx = (unsigned long)&frame->uc;
 
@@ -491,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
        regs->cs = __USER_CS;
 
        return 0;
-
-give_sigsegv:
-       force_sigsegv(sig, current);
-       return -EFAULT;
 }
 
 /*
  * OK, we're invoking a handler:
  */
+/*
+ * Translate a signal number through the current thread's exec domain.
+ *
+ * Returns exec_domain->signal_invmap[sig] when the thread has an exec
+ * domain with a signal_invmap table and sig is below 32; otherwise
+ * returns sig unchanged.
+ */
+static int signr_convert(int sig)
+{
+       struct thread_info *ti = current_thread_info();
+
+       if (sig >= 32 || !ti->exec_domain || !ti->exec_domain->signal_invmap)
+               return sig;
+
+       return ti->exec_domain->signal_invmap[sig];
+}
+
+#define is_ia32        1
+#define ia32_setup_frame       __setup_frame
+#define ia32_setup_rt_frame    __setup_rt_frame
+
+/*
+ * Build the user stack frame for delivering 'sig'.
+ *
+ * When is_ia32 is true, the ia32 frame builders are used and receive the
+ * signr_convert()ed number: the rt variant if the handler was installed
+ * with SA_SIGINFO, the plain variant otherwise.  When is_ia32 is false,
+ * __setup_rt_frame() is called with the unconverted number.
+ *
+ * Returns 0 on success.  If the frame builder fails, force_sigsegv() is
+ * raised for the task and -EFAULT is returned.
+ */
+static int
+setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+              sigset_t *set, struct pt_regs *regs)
+{
+       int usig = signr_convert(sig);
+       int failed;
+
+       /* Set up the stack frame */
+       if (!is_ia32)
+               failed = __setup_rt_frame(sig, ka, info, set, regs);
+       else if (ka->sa.sa_flags & SA_SIGINFO)
+               failed = ia32_setup_rt_frame(usig, ka, info, set, regs);
+       else
+               failed = ia32_setup_frame(usig, ka, set, regs);
+
+       if (!failed)
+               return 0;
+
+       force_sigsegv(sig, current);
+       return -EFAULT;
+}
+
 static int
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
              sigset_t *oldset, struct pt_regs *regs)
@@ -507,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
        int ret;
 
        /* Are we from a system call? */
-       if ((long)regs->orig_ax >= 0) {
+       if (syscall_get_nr(current, regs) >= 0) {
                /* If so, check system call restarting.. */
-               switch (regs->ax) {
+               switch (syscall_get_error(current, regs)) {
                case -ERESTART_RESTARTBLOCK:
                case -ERESTARTNOHAND:
                        regs->ax = -EINTR;
@@ -536,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
            likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
                regs->flags &= ~X86_EFLAGS_TF;
 
-       /* Set up the stack frame */
-       if (ka->sa.sa_flags & SA_SIGINFO)
-               ret = setup_rt_frame(sig, ka, info, oldset, regs);
-       else
-               ret = setup_frame(sig, ka, oldset, regs);
+       ret = setup_rt_frame(sig, ka, info, oldset, regs);
 
        if (ret)
                return ret;
 
+#ifdef CONFIG_X86_64
+       /*
+        * This has nothing to do with segment registers,
+        * despite the name.  This magic affects uaccess.h
+        * macros' behavior.  Reset it to the normal setting.
+        */
+       set_fs(USER_DS);
+#endif
+
        /*
         * Clear the direction flag as per the ABI for function entry.
         */
@@ -571,6 +592,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
        return 0;
 }
 
+#define NR_restart_syscall     __NR_restart_syscall
 /*
  * Note that 'init' is a special process: it doesn't get signals it doesn't
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -623,9 +645,9 @@ static void do_signal(struct pt_regs *regs)
        }
 
        /* Did we come from a system call? */
-       if ((long)regs->orig_ax >= 0) {
+       if (syscall_get_nr(current, regs) >= 0) {
                /* Restart the system call - no handlers present */
-               switch (regs->ax) {
+               switch (syscall_get_error(current, regs)) {
                case -ERESTARTNOHAND:
                case -ERESTARTSYS:
                case -ERESTARTNOINTR:
@@ -634,7 +656,7 @@ static void do_signal(struct pt_regs *regs)
                        break;
 
                case -ERESTART_RESTARTBLOCK:
-                       regs->ax = __NR_restart_syscall;
+                       regs->ax = NR_restart_syscall;
                        regs->ip -= 2;
                        break;
                }
@@ -657,6 +679,12 @@ static void do_signal(struct pt_regs *regs)
 void
 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+       /* notify userspace of pending MCEs */
+       if (thread_info_flags & _TIF_MCE_NOTIFY)
+               mce_notify_user();
+#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
+
        /* deal with pending signal delivery */
        if (thread_info_flags & _TIF_SIGPENDING)
                do_signal(regs);
@@ -666,5 +694,23 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
                tracehook_notify_resume(regs);
        }
 
+#ifdef CONFIG_X86_32
        clear_thread_flag(TIF_IRET);
+#endif /* CONFIG_X86_32 */
+}
+
+/*
+ * Report a bad signal frame and force SIGSEGV on the current task.
+ *
+ * @regs:  register state; ip, sp and orig_ax are printed
+ * @frame: user address of the offending frame
+ * @where: caller-supplied label printed in the log line
+ *
+ * The log line is emitted only when show_unhandled_signals is set and
+ * printk_ratelimit() permits it; the SIGSEGV is sent unconditionally.
+ */
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
+{
+       struct task_struct *tsk = current;
+       int verbose = show_unhandled_signals && printk_ratelimit();
+
+       if (verbose) {
+               printk(KERN_INFO
+                      "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+                      tsk->comm, tsk->pid, where, frame,
+                      regs->ip, regs->sp, regs->orig_ax);
+               print_vma_addr(" in ", regs->ip);
+               printk(KERN_CONT "\n");
+       }
+
+       force_sig(SIGSEGV, tsk);
 }
index 823a55b..a5c9627 100644 (file)
@@ -52,6 +52,16 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
        return do_sigaltstack(uss, uoss, regs->sp);
 }
 
+/*
+ * Helpers for restore_sigcontext().  Each one reads a field of the user
+ * sigcontext 'sc' with __get_user() and ORs the result into 'err'.
+ *
+ * They expand to a single do { } while (0) statement, so "COPY(x);" is
+ * one statement in every context (including an unbraced if/else) and
+ * leaves no stray empty statement behind.
+ */
+
+/* Copy sc->x straight into regs->x. */
+#define COPY(x)                        do {            \
+       err |= __get_user(regs->x, &sc->x);     \
+} while (0)
+
+/* Copy a 16-bit selector from sc->seg into regs->seg with 3 ORed in. */
+#define COPY_SEG_STRICT(seg)   do {                    \
+               unsigned short tmp;                     \
+               err |= __get_user(tmp, &sc->seg);       \
+               regs->seg = tmp | 3;                    \
+} while (0)
+
 /*
  * Do a signal return; undo the signal stack.
  */
@@ -59,13 +69,13 @@ static int
 restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
                   unsigned long *pax)
 {
+       void __user *buf;
+       unsigned int tmpflags;
        unsigned int err = 0;
 
        /* Always make any pending restarted system calls return -EINTR */
        current_thread_info()->restart_block.fn = do_no_restart_syscall;
 
-#define COPY(x)                (err |= __get_user(regs->x, &sc->x))
-
        COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
        COPY(dx); COPY(cx); COPY(ip);
        COPY(r8);
@@ -80,34 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
        /* Kernel saves and restores only the CS segment register on signals,
         * which is the bare minimum needed to allow mixed 32/64-bit code.
         * App's signal handler can save/restore other segments if needed. */
-       {
-               unsigned cs;
-               err |= __get_user(cs, &sc->cs);
-               regs->cs = cs | 3;      /* Force into user mode */
-       }
+       COPY_SEG_STRICT(cs);
 
-       {
-               unsigned int tmpflags;
-               err |= __get_user(tmpflags, &sc->flags);
-               regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
-               regs->orig_ax = -1;             /* disable syscall checks */
-       }
+       err |= __get_user(tmpflags, &sc->flags);
+       regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+       regs->orig_ax = -1;             /* disable syscall checks */
 
-       {
-               struct _fpstate __user *buf;
-               err |= __get_user(buf, &sc->fpstate);
-               err |= restore_i387_xstate(buf);
-       }
+       err |= __get_user(buf, &sc->fpstate);
+       err |= restore_i387_xstate(buf);
 
        err |= __get_user(*pax, &sc->ax);
        return err;
 }
 
-asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
+static long do_rt_sigreturn(struct pt_regs *regs)
 {
        struct rt_sigframe __user *frame;
-       sigset_t set;
        unsigned long ax;
+       sigset_t set;
 
        frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
        if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -130,10 +130,15 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
        return ax;
 
 badframe:
-       signal_fault(regs, frame, "sigreturn");
+       signal_fault(regs, frame, "rt_sigreturn");
        return 0;
 }
 
+/*
+ * rt_sigreturn system call entry: passes the register frame straight to
+ * do_rt_sigreturn() and returns its result.
+ */
+asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
+{
+       return do_rt_sigreturn(regs);
+}
+
 /*
  * Set up a signal frame.
  */
@@ -195,8 +200,8 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
        return (void __user *)round_down(sp - size, 64);
 }
 
-static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-                          sigset_t *set, struct pt_regs *regs)
+static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+                           sigset_t *set, struct pt_regs *regs)
 {
        struct rt_sigframe __user *frame;
        void __user *fp = NULL;
@@ -209,17 +214,16 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
                        (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
 
                if (save_i387_xstate(fp) < 0)
-                       err |= -1;
+                       return -EFAULT;
        } else
                frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
 
        if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-               goto give_sigsegv;
+               return -EFAULT;
 
        if (ka->sa.sa_flags & SA_SIGINFO) {
-               err |= copy_siginfo_to_user(&frame->info, info);
-               if (err)
-                       goto give_sigsegv;
+               if (copy_siginfo_to_user(&frame->info, info))
+                       return -EFAULT;
        }
 
        /* Create the ucontext.  */
@@ -247,11 +251,11 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
                err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
        } else {
                /* could use a vstub here */
-               goto give_sigsegv;
+               return -EFAULT;
        }
 
        if (err)
-               goto give_sigsegv;
+               return -EFAULT;
 
        /* Set up registers for signal handler */
        regs->di = sig;
@@ -271,15 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
        regs->cs = __USER_CS;
 
        return 0;
-
-give_sigsegv:
-       force_sigsegv(sig, current);
-       return -EFAULT;
 }
 
 /*
  * OK, we're invoking a handler
  */
+/*
+ * Signal number conversion hook used by setup_rt_frame().  In this file
+ * it is the identity: sig is returned unchanged.
+ */
+static int signr_convert(int sig)
+{
+       return sig;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+#define is_ia32        test_thread_flag(TIF_IA32)
+#else
+#define is_ia32        0
+#endif
+
+/*
+ * Build the user stack frame for delivering 'sig'.
+ *
+ * When is_ia32 is true, the ia32 frame builders are used and receive the
+ * signr_convert()ed number: the rt variant if the handler was installed
+ * with SA_SIGINFO, the plain variant otherwise.  When is_ia32 is false,
+ * __setup_rt_frame() is called with the unconverted number.
+ *
+ * Returns 0 on success.  If the frame builder fails, force_sigsegv() is
+ * raised for the task and -EFAULT is returned.
+ */
+static int
+setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+              sigset_t *set, struct pt_regs *regs)
+{
+       int usig = signr_convert(sig);
+       int failed;
+
+       /* Set up the stack frame */
+       if (!is_ia32)
+               failed = __setup_rt_frame(sig, ka, info, set, regs);
+       else if (ka->sa.sa_flags & SA_SIGINFO)
+               failed = ia32_setup_rt_frame(usig, ka, info, set, regs);
+       else
+               failed = ia32_setup_frame(usig, ka, set, regs);
+
+       if (!failed)
+               return 0;
+
+       force_sigsegv(sig, current);
+       return -EFAULT;
+}
 
 static int
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -317,51 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
            likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
                regs->flags &= ~X86_EFLAGS_TF;
 
-#ifdef CONFIG_IA32_EMULATION
-       if (test_thread_flag(TIF_IA32)) {
-               if (ka->sa.sa_flags & SA_SIGINFO)
-                       ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
-               else
-                       ret = ia32_setup_frame(sig, ka, oldset, regs);
-       } else
-#endif
        ret = setup_rt_frame(sig, ka, info, oldset, regs);
 
-       if (ret == 0) {
-               /*
-                * This has nothing to do with segment registers,
-                * despite the name.  This magic affects uaccess.h
-                * macros' behavior.  Reset it to the normal setting.
-                */
-               set_fs(USER_DS);
+       if (ret)
+               return ret;
 
-               /*
-                * Clear the direction flag as per the ABI for function entry.
-                */
-               regs->flags &= ~X86_EFLAGS_DF;
+#ifdef CONFIG_X86_64
+       /*
+        * This has nothing to do with segment registers,
+        * despite the name.  This magic affects uaccess.h
+        * macros' behavior.  Reset it to the normal setting.
+        */
+       set_fs(USER_DS);
+#endif
 
-               /*
-                * Clear TF when entering the signal handler, but
-                * notify any tracer that was single-stepping it.
-                * The tracer may want to single-step inside the
-                * handler too.
-                */
-               regs->flags &= ~X86_EFLAGS_TF;
+       /*
+        * Clear the direction flag as per the ABI for function entry.
+        */
+       regs->flags &= ~X86_EFLAGS_DF;
 
-               spin_lock_irq(&current->sighand->siglock);
-               sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
-               if (!(ka->sa.sa_flags & SA_NODEFER))
-                       sigaddset(&current->blocked, sig);
-               recalc_sigpending();
-               spin_unlock_irq(&current->sighand->siglock);
+       /*
+        * Clear TF when entering the signal handler, but
+        * notify any tracer that was single-stepping it.
+        * The tracer may want to single-step inside the
+        * handler too.
+        */
+       regs->flags &= ~X86_EFLAGS_TF;
 
-               tracehook_signal_handler(sig, info, ka, regs,
-                                        test_thread_flag(TIF_SINGLESTEP));
-       }
+       spin_lock_irq(&current->sighand->siglock);
+       sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
+       if (!(ka->sa.sa_flags & SA_NODEFER))
+               sigaddset(&current->blocked, sig);
+       recalc_sigpending();
+       spin_unlock_irq(&current->sighand->siglock);
 
-       return ret;
+       tracehook_signal_handler(sig, info, ka, regs,
+                                test_thread_flag(TIF_SINGLESTEP));
+
+       return 0;
 }
 
+/*
+ * Syscall number loaded into regs->ax to restart a call that returned
+ * -ERESTART_RESTARTBLOCK: the ia32 number for TIF_IA32 threads, the
+ * native one otherwise.  The expansion is parenthesized so the
+ * conditional cannot mis-bind with operators around the macro use.
+ */
+#define NR_restart_syscall     \
+       (test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall)
 /*
  * Note that 'init' is a special process: it doesn't get signals it doesn't
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -391,7 +422,8 @@ static void do_signal(struct pt_regs *regs)
 
        signr = get_signal_to_deliver(&info, &ka, regs, NULL);
        if (signr > 0) {
-               /* Re-enable any watchpoints before delivering the
+               /*
+                * Re-enable any watchpoints before delivering the
                 * signal to user space. The processor register will
                 * have been cleared if the watchpoint triggered
                 * inside the kernel.
@@ -399,7 +431,7 @@ static void do_signal(struct pt_regs *regs)
                if (current->thread.debugreg7)
                        set_debugreg(current->thread.debugreg7, 7);
 
-               /* Whee!  Actually deliver the signal.  */
+               /* Whee! Actually deliver the signal.  */
                if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
                        /*
                         * A signal was successfully delivered; the saved
@@ -422,10 +454,9 @@ static void do_signal(struct pt_regs *regs)
                        regs->ax = regs->orig_ax;
                        regs->ip -= 2;
                        break;
+
                case -ERESTART_RESTARTBLOCK:
-                       regs->ax = test_thread_flag(TIF_IA32) ?
-                                       __NR_ia32_restart_syscall :
-                                       __NR_restart_syscall;
+                       regs->ax = NR_restart_syscall;
                        regs->ip -= 2;
                        break;
                }
@@ -441,14 +472,18 @@ static void do_signal(struct pt_regs *regs)
        }
 }
 
-void do_notify_resume(struct pt_regs *regs, void *unused,
-                     __u32 thread_info_flags)
+/*
+ * notification of userspace execution resumption
+ * - triggered by the TIF_WORK_MASK flags
+ */
+void
+do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
-#ifdef CONFIG_X86_MCE
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
        /* notify userspace of pending MCEs */
        if (thread_info_flags & _TIF_MCE_NOTIFY)
                mce_notify_user();
-#endif /* CONFIG_X86_MCE */
+#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
 
        /* deal with pending signal delivery */
        if (thread_info_flags & _TIF_SIGPENDING)
@@ -458,17 +493,23 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
                clear_thread_flag(TIF_NOTIFY_RESUME);
                tracehook_notify_resume(regs);
        }
+
+#ifdef CONFIG_X86_32
+       clear_thread_flag(TIF_IRET);
+#endif /* CONFIG_X86_32 */
 }
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 {
        struct task_struct *me = current;
+
        if (show_unhandled_signals && printk_ratelimit()) {
-               printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
-              me->comm, me->pid, where, frame, regs->ip,
-                  regs->sp, regs->orig_ax);
+               printk(KERN_INFO
+                      "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+                      me->comm, me->pid, where, frame,
+                      regs->ip, regs->sp, regs->orig_ax);
                print_vma_addr(" in ", regs->ip);
-               printk("\n");
+               printk(KERN_CONT "\n");
        }
 
        force_sig(SIGSEGV, me);
index 361b7a4..18f9b19 100644 (file)
@@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
 struct smp_ops smp_ops = {
        .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
        .smp_prepare_cpus = native_smp_prepare_cpus,
-       .cpu_up = native_cpu_up,
        .smp_cpus_done = native_smp_cpus_done,
 
        .smp_send_stop = native_smp_send_stop,
        .smp_send_reschedule = native_smp_send_reschedule,
 
+       .cpu_up = native_cpu_up,
+       .cpu_die = native_cpu_die,
+       .cpu_disable = native_cpu_disable,
+       .play_dead = native_play_dead,
+
        .send_call_func_ipi = native_send_call_func_ipi,
        .send_call_func_single_ipi = native_send_call_func_single_ipi,
 };
index 9056f7e..76b6f50 100644 (file)
@@ -52,6 +52,7 @@
 #include <asm/desc.h>
 #include <asm/nmi.h>
 #include <asm/irq.h>
+#include <asm/idle.h>
 #include <asm/smp.h>
 #include <asm/trampoline.h>
 #include <asm/cpu.h>
@@ -1344,25 +1345,9 @@ static void __ref remove_cpu_from_maps(int cpu)
        numa_remove_cpu(cpu);
 }
 
-int __cpu_disable(void)
+void cpu_disable_common(void)
 {
        int cpu = smp_processor_id();
-
-       /*
-        * Perhaps use cpufreq to drop frequency, but that could go
-        * into generic code.
-        *
-        * We won't take down the boot processor on i386 due to some
-        * interrupts only being able to be serviced by the BSP.
-        * Especially so if we're not using an IOAPIC   -zwane
-        */
-       if (cpu == 0)
-               return -EBUSY;
-
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               stop_apic_nmi_watchdog(NULL);
-       clear_local_APIC();
-
        /*
         * HACK:
         * Allow any queued timer interrupts to get serviced
@@ -1380,10 +1365,32 @@ int __cpu_disable(void)
        remove_cpu_from_maps(cpu);
        unlock_vector_lock();
        fixup_irqs(cpu_online_map);
+}
+
+/*
+ * Native cpu_disable hook: take the calling CPU out of service.
+ *
+ * Returns -EBUSY for CPU 0, which is never taken down; otherwise stops
+ * the local-APIC NMI watchdog if that is the active watchdog, clears the
+ * local APIC, runs cpu_disable_common() and returns 0.
+ */
+int native_cpu_disable(void)
+{
+       /*
+        * Perhaps use cpufreq to drop frequency, but that could go
+        * into generic code.
+        *
+        * The boot processor is not taken down on i386 because some
+        * interrupts can only be serviced by the BSP, especially
+        * when no IOAPIC is in use.   -zwane
+        */
+       if (smp_processor_id() == 0)
+               return -EBUSY;
+
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               stop_apic_nmi_watchdog(NULL);
+
+       clear_local_APIC();
+       cpu_disable_common();
+
        return 0;
 }
 
-void __cpu_die(unsigned int cpu)
+void native_cpu_die(unsigned int cpu)
 {
        /* We don't do anything here: idle task is faking death itself. */
        unsigned int i;
@@ -1400,15 +1407,45 @@ void __cpu_die(unsigned int cpu)
        }
        printk(KERN_ERR "CPU %u didn't die...\n", cpu);
 }
+
+/*
+ * Shared part of the play_dead path (native_play_dead() calls this and
+ * then halts).  In order: leaves the idle task context, resets the lazy
+ * TLB state, releases this CPU's irq context and C1E bookkeeping, marks
+ * the CPU as CPU_DEAD in its per-cpu cpu_state, and disables local
+ * interrupts.  The caller is expected to stop the CPU afterwards.
+ */
+void play_dead_common(void)
+{
+       idle_task_exit();
+       reset_lazy_tlbstate();
+       irq_ctx_exit(raw_smp_processor_id());
+       c1e_remove_cpu(raw_smp_processor_id());
+
+       /* Full barrier before the state change below becomes visible. */
+       mb();
+       /*
+        * Ack it: publish CPU_DEAD in this CPU's cpu_state.
+        * NOTE(review): presumably polled by the CPU running
+        * native_cpu_die() -- confirm against its wait loop.
+        */
+       __get_cpu_var(cpu_state) = CPU_DEAD;
+
+       /*
+        * With physical CPU hotplug, we should halt the cpu
+        */
+       local_irq_disable();
+}
+
+/*
+ * Native play_dead hook: run the common teardown in play_dead_common(),
+ * then call wbinvd_halt().
+ */
+void native_play_dead(void)
+{
+       play_dead_common();
+       wbinvd_halt();
+}
+
 #else /* ... !CONFIG_HOTPLUG_CPU */
-int __cpu_disable(void)
+int native_cpu_disable(void)
 {
        return -ENOSYS;
 }
 
-void __cpu_die(unsigned int cpu)
+void native_cpu_die(unsigned int cpu)
 {
        /* We said "no" in __cpu_disable */
        BUG();
 }
+
+/*
+ * !CONFIG_HOTPLUG_CPU stub: native_cpu_disable() returns -ENOSYS in this
+ * configuration, so reaching this function is a bug.
+ */
+void native_play_dead(void)
+{
+       BUG();
+}
+
 #endif
index fec1ece..e00534b 100644 (file)
@@ -241,3 +241,11 @@ void flush_tlb_all(void)
        on_each_cpu(do_flush_tlb_all, NULL, 1);
 }
 
+/*
+ * Reset the calling CPU's cpu_tlbstate entry: state is set to 0 and
+ * active_mm is pointed at init_mm.  Called from play_dead_common().
+ */
+void reset_lazy_tlbstate(void)
+{
+       int cpu = raw_smp_processor_id();
+
+       per_cpu(cpu_tlbstate, cpu).state = 0;
+       per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
+}
+
index da5a596..0429c5d 100644 (file)
@@ -891,6 +891,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
 {
        struct task_struct *tsk = current;
        unsigned int condition;
+       int si_code;
 
        trace_hardirqs_fixup();
 
@@ -935,8 +936,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
                        goto clear_TF_reenable;
        }
 
+       si_code = get_si_code((unsigned long)condition);
        /* Ok, finally something we can handle */
-       send_sigtrap(tsk, regs, error_code);
+       send_sigtrap(tsk, regs, error_code, si_code);
 
        /*
         * Disable additional traps. They'll be re-enabled when
index 2887a78..9c0ac0c 100644 (file)
@@ -940,7 +940,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs *regs,
        tsk->thread.error_code = error_code;
        info.si_signo = SIGTRAP;
        info.si_errno = 0;
-       info.si_code = TRAP_BRKPT;
+       info.si_code = get_si_code(condition);
        info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
        force_sig_info(SIGTRAP, &info, tsk);
 
index 201e81a..46e0544 100644 (file)
@@ -172,8 +172,8 @@ SECTIONS
   .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
        *(.x86_cpu_dev.init)
   }
-  SECURITY_INIT
   __x86_cpu_dev_end = .;
+  SECURITY_INIT
 
   . = ALIGN(8);
   .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
index 07713d6..9abac8a 100644 (file)
@@ -95,7 +95,9 @@ int save_i387_xstate(void __user *buf)
                 * Start with clearing the user buffer. This will present a
                 * clean context for the bytes not touched by the fxsave/xsave.
                 */
-               __clear_user(buf, sig_xstate_size);
+               err = __clear_user(buf, sig_xstate_size);
+               if (err)
+                       return err;
 
                if (task_thread_info(tsk)->status & TS_XSAVE)
                        err = xsave_user(buf);
@@ -114,6 +116,8 @@ int save_i387_xstate(void __user *buf)
 
        if (task_thread_info(tsk)->status & TS_XSAVE) {
                struct _fpstate __user *fx = buf;
+               struct _xstate __user *x = buf;
+               u64 xstate_bv;
 
                err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
                                     sizeof(struct _fpx_sw_bytes));
@@ -121,6 +125,31 @@ int save_i387_xstate(void __user *buf)
                err |= __put_user(FP_XSTATE_MAGIC2,
                                  (__u32 __user *) (buf + sig_xstate_size
                                                    - FP_XSTATE_MAGIC2_SIZE));
+
+               /*
+                * Read the xstate_bv which we copied (directly from the cpu or
+                * from the state in task struct) to the user buffers and
+                * set the FP/SSE bits.
+                */
+               err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
+
+               /*
+                * For legacy compatibility, we always set the FP/SSE bits in
+                * the bit vector while saving the state to the user context.
+                * This lets us capture any changes (during sigreturn) to
+                * the FP/SSE bits made by legacy applications which don't
+                * touch xstate_bv in the xsave header.
+                *
+                * xsave-aware apps can change the xstate_bv in the xsave
+                * header as well as change any contents in the memory layout.
+                * xrestore as part of sigreturn will capture all the changes.
+                */
+               xstate_bv |= XSTATE_FPSSE;
+
+               err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
+
+               if (err)
+                       return err;
        }
 
        return 1;
@@ -272,7 +301,7 @@ void __cpuinit xsave_init(void)
 /*
  * setup the xstate image representing the init state
  */
-void setup_xstate_init(void)
+static void __init setup_xstate_init(void)
 {
        init_xstate_buf = alloc_bootmem(xstate_size);
        init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
index 8f92cac..a742d75 100644 (file)
@@ -914,15 +914,15 @@ LIST_HEAD(pgd_list);
 
 void vmalloc_sync_all(void)
 {
-#ifdef CONFIG_X86_32
-       unsigned long start = VMALLOC_START & PGDIR_MASK;
        unsigned long address;
 
+#ifdef CONFIG_X86_32
        if (SHARED_KERNEL_PMD)
                return;
 
-       BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
-       for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
+       for (address = VMALLOC_START & PMD_MASK;
+            address >= TASK_SIZE && address < FIXADDR_TOP;
+            address += PMD_SIZE) {
                unsigned long flags;
                struct page *page;
 
@@ -935,10 +935,8 @@ void vmalloc_sync_all(void)
                spin_unlock_irqrestore(&pgd_lock, flags);
        }
 #else /* CONFIG_X86_64 */
-       unsigned long start = VMALLOC_START & PGDIR_MASK;
-       unsigned long address;
-
-       for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+       for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
+            address += PGDIR_SIZE) {
                const pgd_t *pgd_ref = pgd_offset_k(address);
                unsigned long flags;
                struct page *page;
index c3789bb..bbe044d 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/cpumask.h>
 
 #include <asm/asm.h>
+#include <asm/bios_ebda.h>
 #include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -969,6 +970,8 @@ void __init mem_init(void)
        int codesize, reservedpages, datasize, initsize;
        int tmp;
 
+       start_periodic_check_for_corruption();
+
 #ifdef CONFIG_FLATMEM
        BUG_ON(!mem_map);
 #endif
index 83e13f2..3e10054 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/nmi.h>
 
 #include <asm/processor.h>
+#include <asm/bios_ebda.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -881,6 +882,8 @@ void __init mem_init(void)
 {
        long codesize, reservedpages, datasize, initsize;
 
+       start_periodic_check_for_corruption();
+
        pci_iommu_alloc();
 
        /* clear_bss() already clear the empty_zero_page */
index 6ab3196..8cbeda1 100644 (file)
 
 #ifdef CONFIG_X86_64
 
-unsigned long __phys_addr(unsigned long x)
+static inline int phys_addr_valid(unsigned long addr)
 {
-       if (x >= __START_KERNEL_map)
-               return x - __START_KERNEL_map + phys_base;
-       return x - PAGE_OFFSET;
+       return addr < (1UL << boot_cpu_data.x86_phys_bits);
 }
-EXPORT_SYMBOL(__phys_addr);
 
-static inline int phys_addr_valid(unsigned long addr)
+unsigned long __phys_addr(unsigned long x)
 {
-       return addr < (1UL << boot_cpu_data.x86_phys_bits);
+       if (x >= __START_KERNEL_map) {
+               x -= __START_KERNEL_map;
+               VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
+               x += phys_base;
+       } else {
+               VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+               x -= PAGE_OFFSET;
+               VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
+                                       !phys_addr_valid(x));
+       }
+       return x;
 }
+EXPORT_SYMBOL(__phys_addr);
 
 #else
 
@@ -44,6 +52,17 @@ static inline int phys_addr_valid(unsigned long addr)
        return 1;
 }
 
+#ifdef CONFIG_DEBUG_VIRTUAL
+/*
+ * Sanity-checked virtual-to-physical conversion, built only when
+ * CONFIG_DEBUG_VIRTUAL is set.
+ *
+ * Trips VIRTUAL_BUG_ON() when x is below PAGE_OFFSET or, once
+ * system_state has left SYSTEM_BOOTING, when x is a vmalloc address.
+ * Otherwise returns x - PAGE_OFFSET.
+ */
+unsigned long __phys_addr(unsigned long x)
+{
+       /*
+        * VMALLOC_* aren't constants and aren't available at boot time,
+        * so the vmalloc check is skipped while the system is booting.
+        */
+       VIRTUAL_BUG_ON(x < PAGE_OFFSET || (system_state != SYSTEM_BOOTING &&
+                                       is_vmalloc_addr((void *)x)));
+       return x - PAGE_OFFSET;
+}
+EXPORT_SYMBOL(__phys_addr);
+#endif
+
 #endif
 
 int page_is_ram(unsigned long pagenr)
@@ -614,7 +633,7 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
         */
        offset = phys_addr & ~PAGE_MASK;
        phys_addr &= PAGE_MASK;
-       size = PAGE_ALIGN(last_addr) - phys_addr;
+       size = PAGE_ALIGN(last_addr + 1) - phys_addr;
 
        /*
         * Mappings have to fit in the FIX_BTMAP area.
index 3815e42..87b9ab1 100644 (file)
@@ -26,5 +26,13 @@ config XEN_MAX_DOMAIN_MEMORY
 
 config XEN_SAVE_RESTORE
        bool
-       depends on PM
-       default y
\ No newline at end of file
+       depends on XEN && PM
+       default y
+
+config XEN_DEBUG_FS
+       bool "Enable Xen debug and tuning parameters in debugfs"
+       depends on XEN && DEBUG_FS
+       default n
+       help
+         Enable statistics output and various tuning options in debugfs.
+         Enabling this option may incur a significant performance overhead.
index 59c1e53..3139479 100644 (file)
@@ -1,4 +1,12 @@
-obj-y          := enlighten.o setup.o multicalls.o mmu.o \
+ifdef CONFIG_FTRACE
+# Do not profile debug and lowlevel utilities
+CFLAGS_REMOVE_spinlock.o = -pg
+CFLAGS_REMOVE_time.o = -pg
+CFLAGS_REMOVE_irq.o = -pg
+endif
+
+obj-y          := enlighten.o setup.o multicalls.o mmu.o irq.o \
                        time.o xen-asm_$(BITS).o grant-table.o suspend.o
 
-obj-$(CONFIG_SMP)      += smp.o
+obj-$(CONFIG_SMP)              += smp.o spinlock.o
+obj-$(CONFIG_XEN_DEBUG_FS)     += debugfs.o
\ No newline at end of file
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
new file mode 100644 (file)
index 0000000..b53225d
--- /dev/null
@@ -0,0 +1,123 @@
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+#include "debugfs.h"
+
+static struct dentry *d_xen_debug;
+
+/*
+ * Return the dentry of the top-level "xen" debugfs directory, creating
+ * it on the first call and reusing the cached dentry afterwards.
+ * Logs a warning and returns NULL if the directory cannot be created.
+ */
+struct dentry * __init xen_init_debugfs(void)
+{
+       /* Already created by an earlier caller. */
+       if (d_xen_debug)
+               return d_xen_debug;
+
+       d_xen_debug = debugfs_create_dir("xen", NULL);
+       if (!d_xen_debug)
+               pr_warning("Could not create 'xen' debugfs directory\n");
+
+       return d_xen_debug;
+}
+
+/*
+ * Describes one u32 array exported through debugfs; stored as the
+ * private data of the file created by xen_debugfs_create_u32_array().
+ */
+struct array_data
+{
+       void *array;            /* start of the exported array (passed in as u32 *) */
+       unsigned elements;      /* number of entries in the array */
+};
+
+/*
+ * open(): start with no formatted text cached in file->private_data
+ * (u32_array_read() fills it in) and open the file via
+ * nonseekable_open().
+ */
+static int u32_array_open(struct inode *inode, struct file *file)
+{
+       file->private_data = NULL;
+       return nonseekable_open(inode, file);
+}
+
+/*
+ * Format array_size u32 values from array into buf, using fmt for each
+ * element.  Elements are separated by ' ', the last one is followed by
+ * '\n', and the text is NUL-terminated.
+ *
+ * buf may be NULL (with bufsize 0) to only measure the output.
+ *
+ * Returns the number of bytes needed for the complete text, including
+ * the trailing NUL, whether or not all of it fitted.
+ *
+ * Never writes more than bufsize bytes.  If an element does not fit
+ * (for instance because a value gained a digit after the buffer was
+ * sized), the text is cut after the last complete element and the
+ * remaining elements are only counted.
+ */
+static size_t format_array(char *buf, size_t bufsize, const char *fmt,
+                          u32 *array, unsigned array_size)
+{
+       size_t ret = 0;
+       unsigned i;
+
+       for (i = 0; i < array_size; i++) {
+               size_t len;
+
+               len = snprintf(buf, bufsize, fmt, array[i]);
+               len++;  /* ' ' or '\n' */
+               ret += len;
+
+               if (!buf)
+                       continue;
+
+               /* Need len bytes here plus one more for the final NUL. */
+               if (len >= bufsize) {
+                       if (bufsize)
+                               *buf = '\0';    /* drop the partial element */
+                       buf = NULL;             /* count-only from now on */
+                       bufsize = 0;
+                       continue;
+               }
+
+               buf += len;
+               bufsize -= len;
+               buf[-1] = (i == array_size-1) ? '\n' : ' ';
+       }
+
+       ret++;          /* \0 */
+       if (buf && bufsize)
+               *buf = '\0';
+
+       return ret;
+}
+
+/*
+ * Format the array into a freshly kmalloc'ed string.  The array is
+ * formatted twice: once without a buffer to learn the required size,
+ * then again into the allocation.
+ *
+ * Returns the string (caller frees it with kfree()), or NULL if the
+ * allocation failed.
+ */
+static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
+{
+       char *text;
+       size_t needed;
+
+       /* Measuring pass: no buffer, only the byte count is returned. */
+       needed = format_array(NULL, 0, fmt, array, array_size);
+
+       text = kmalloc(needed, GFP_KERNEL);
+       if (text != NULL)
+               format_array(text, needed, fmt, array, array_size);
+
+       return text;
+}
+
+/*
+ * read(): hand out the array as text.
+ *
+ * A read at offset 0 takes a new snapshot: any previously cached text
+ * is freed and the array is formatted again into file->private_data.
+ * Reads at other offsets continue from the cached snapshot.
+ *
+ * Returns the number of bytes copied (via simple_read_from_buffer()),
+ * or -ENOMEM if the snapshot could not be allocated.
+ */
+static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
+                             loff_t *ppos)
+{
+       struct inode *inode = file->f_path.dentry->d_inode;
+       struct array_data *data = inode->i_private;
+       size_t size;
+
+       if (*ppos == 0) {
+               /* kfree(NULL) is a no-op, so no need to test first */
+               kfree(file->private_data);
+
+               file->private_data = format_array_alloc("%u", data->array, data->elements);
+
+               /* Report the failure rather than faking an empty file */
+               if (!file->private_data)
+                       return -ENOMEM;
+       }
+
+       size = 0;
+       if (file->private_data)
+               size = strlen(file->private_data);
+
+       return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
+}
+
+/*
+ * release(): free the formatted text cached in file->private_data by
+ * u32_array_read(), if any.  Always returns 0.
+ */
+static int xen_array_release(struct inode *inode, struct file *file)
+{
+       kfree(file->private_data);
+
+       return 0;
+}
+
+/*
+ * File operations for the u32-array debugfs files: read-only text
+ * view, opened non-seekable.  The table is never modified, so it is
+ * const.
+ */
+static const struct file_operations u32_array_fops = {
+       .owner  = THIS_MODULE,
+       .open   = u32_array_open,
+       .release = xen_array_release,
+       .read   = u32_array_read,
+};
+
+/*
+ * Create a debugfs file that shows a u32 array as text.
+ *
+ * @name:     file name
+ * @mode:     file permissions
+ * @parent:   directory dentry to create the file in
+ * @array:    array to export; the file keeps referring to it, it is
+ *            not copied
+ * @elements: number of entries in @array
+ *
+ * Returns the new file's dentry, or NULL if the bookkeeping structure
+ * could not be allocated or the file could not be created.
+ */
+struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+                                           struct dentry *parent,
+                                           u32 *array, unsigned elements)
+{
+       struct dentry *dentry;
+       struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+       if (data == NULL)
+               return NULL;
+
+       data->array = array;
+       data->elements = elements;
+
+       dentry = debugfs_create_file(name, mode, parent, data, &u32_array_fops);
+
+       /*
+        * Nothing else refers to data if the file was not created, so
+        * free it here instead of leaking it.
+        * NOTE(review): assumes debugfs_create_file() signals failure
+        * with NULL in this tree -- confirm.
+        */
+       if (!dentry)
+               kfree(data);
+
+       return dentry;
+}
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
new file mode 100644 (file)
index 0000000..e281320
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef _XEN_DEBUGFS_H
+#define _XEN_DEBUGFS_H
+
+struct dentry * __init xen_init_debugfs(void);
+
+struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+                                           struct dentry *parent,
+                                           u32 *array, unsigned elements);
+
+#endif /* _XEN_DEBUGFS_H */
index a27d562..0013a72 100644 (file)
@@ -30,7 +30,6 @@
 #include <xen/interface/xen.h>
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
-#include <xen/interface/sched.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/hvc-console.h>
@@ -58,6 +57,9 @@ EXPORT_SYMBOL_GPL(hypercall_page);
 DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 
+enum xen_domain_type xen_domain_type = XEN_NATIVE;
+EXPORT_SYMBOL_GPL(xen_domain_type);
+
 /*
  * Identity map, in addition to plain kernel map.  This needs to be
  * large enough to allocate page table pages to allocate the rest.
@@ -111,7 +113,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
  *
  * 0: not available, 1: available
  */
-static int have_vcpu_info_placement = 1;
+static int have_vcpu_info_placement =
+#ifdef CONFIG_X86_32
+       1
+#else
+       0
+#endif
+       ;
+
 
 static void xen_vcpu_setup(int cpu)
 {
@@ -227,103 +236,68 @@ static unsigned long xen_get_debugreg(int reg)
        return HYPERVISOR_get_debugreg(reg);
 }
 
-static unsigned long xen_save_fl(void)
+static void xen_leave_lazy(void)
 {
-       struct vcpu_info *vcpu;
-       unsigned long flags;
-
-       vcpu = x86_read_percpu(xen_vcpu);
-
-       /* flag has opposite sense of mask */
-       flags = !vcpu->evtchn_upcall_mask;
-
-       /* convert to IF type flag
-          -0 -> 0x00000000
-          -1 -> 0xffffffff
-       */
-       return (-flags) & X86_EFLAGS_IF;
+       paravirt_leave_lazy(paravirt_get_lazy_mode());
+       xen_mc_flush();
 }
 
-static void xen_restore_fl(unsigned long flags)
+static unsigned long xen_store_tr(void)
 {
-       struct vcpu_info *vcpu;
-
-       /* convert from IF type flag */
-       flags = !(flags & X86_EFLAGS_IF);
-
-       /* There's a one instruction preempt window here.  We need to
-          make sure we're don't switch CPUs between getting the vcpu
-          pointer and updating the mask. */
-       preempt_disable();
-       vcpu = x86_read_percpu(xen_vcpu);
-       vcpu->evtchn_upcall_mask = flags;
-       preempt_enable_no_resched();
-
-       /* Doesn't matter if we get preempted here, because any
-          pending event will get dealt with anyway. */
-
-       if (flags == 0) {
-               preempt_check_resched();
-               barrier(); /* unmask then check (avoid races) */
-               if (unlikely(vcpu->evtchn_upcall_pending))
-                       force_evtchn_callback();
-       }
+       return 0;
 }
 
-static void xen_irq_disable(void)
+/*
+ * Set the page permissions for a particular virtual address.  If the
+ * address is a vmalloc mapping (or other non-linear mapping), then
+ * find the linear mapping of the page and also set its protections to
+ * match.
+ */
+static void set_aliased_prot(void *v, pgprot_t prot)
 {
-       /* There's a one instruction preempt window here.  We need to
-          make sure we're don't switch CPUs between getting the vcpu
-          pointer and updating the mask. */
-       preempt_disable();
-       x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
-       preempt_enable_no_resched();
-}
+       int level;
+       pte_t *ptep;
+       pte_t pte;
+       unsigned long pfn;
+       struct page *page;
 
-static void xen_irq_enable(void)
-{
-       struct vcpu_info *vcpu;
+       ptep = lookup_address((unsigned long)v, &level);
+       BUG_ON(ptep == NULL);
 
-       /* We don't need to worry about being preempted here, since
-          either a) interrupts are disabled, so no preemption, or b)
-          the caller is confused and is trying to re-enable interrupts
-          on an indeterminate processor. */
+       pfn = pte_pfn(*ptep);
+       page = pfn_to_page(pfn);
 
-       vcpu = x86_read_percpu(xen_vcpu);
-       vcpu->evtchn_upcall_mask = 0;
+       pte = pfn_pte(pfn, prot);
 
-       /* Doesn't matter if we get preempted here, because any
-          pending event will get dealt with anyway. */
+       if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
+               BUG();
 
-       barrier(); /* unmask then check (avoid races) */
-       if (unlikely(vcpu->evtchn_upcall_pending))
-               force_evtchn_callback();
-}
+       if (!PageHighMem(page)) {
+               void *av = __va(PFN_PHYS(pfn));
 
-static void xen_safe_halt(void)
-{
-       /* Blocking includes an implicit local_irq_enable(). */
-       if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
-               BUG();
+               if (av != v)
+                       if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
+                               BUG();
+       } else
+               kmap_flush_unused();
 }
 
-static void xen_halt(void)
+static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 {
-       if (irqs_disabled())
-               HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
-       else
-               xen_safe_halt();
-}
+       const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+       int i;
 
-static void xen_leave_lazy(void)
-{
-       paravirt_leave_lazy(paravirt_get_lazy_mode());
-       xen_mc_flush();
+       for(i = 0; i < entries; i += entries_per_page)
+               set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
 }
 
-static unsigned long xen_store_tr(void)
+static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
 {
-       return 0;
+       const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+       int i;
+
+       for(i = 0; i < entries; i += entries_per_page)
+               set_aliased_prot(ldt + i, PAGE_KERNEL);
 }
 
 static void xen_set_ldt(const void *addr, unsigned entries)
@@ -426,8 +400,7 @@ static void xen_load_gs_index(unsigned int idx)
 static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
                                const void *ptr)
 {
-       unsigned long lp = (unsigned long)&dt[entrynum];
-       xmaddr_t mach_lp = virt_to_machine(lp);
+       xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
        u64 entry = *(u64 *)ptr;
 
        preempt_disable();
@@ -560,7 +533,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 }
 
 static void xen_load_sp0(struct tss_struct *tss,
-                         struct thread_struct *thread)
+                        struct thread_struct *thread)
 {
        struct multicall_space mcs = xen_mc_entry(0);
        MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
@@ -835,6 +808,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
                        ret = -EFAULT;
                break;
 #endif
+
+       case MSR_STAR:
+       case MSR_CSTAR:
+       case MSR_LSTAR:
+       case MSR_SYSCALL_MASK:
+       case MSR_IA32_SYSENTER_CS:
+       case MSR_IA32_SYSENTER_ESP:
+       case MSR_IA32_SYSENTER_EIP:
+               /* Fast syscall setup is all done in hypercalls, so
+                  these are all ignored.  Stub them out here to stop
+                  Xen console noise. */
+               break;
+
        default:
                ret = native_write_msr_safe(msr, low, high);
        }
@@ -878,8 +864,8 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
                SetPagePinned(page);
 
                if (!PageHighMem(page)) {
-                       make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-                       if (level == PT_PTE)
+                       make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+                       if (level == PT_PTE && USE_SPLIT_PTLOCKS)
                                pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
                } else
                        /* make sure there are no stray mappings of
@@ -947,7 +933,7 @@ static void xen_release_ptpage(unsigned long pfn, unsigned level)
 
        if (PagePinned(page)) {
                if (!PageHighMem(page)) {
-                       if (level == PT_PTE)
+                       if (level == PT_PTE && USE_SPLIT_PTLOCKS)
                                pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
                        make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
                }
@@ -994,6 +980,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
 }
 #endif
 
+#ifdef CONFIG_X86_32
 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 {
        /* If there's an existing pte, then don't allow _PAGE_RW to be set */
@@ -1012,6 +999,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
 
        xen_set_pte(ptep, pte);
 }
+#endif
 
 static __init void xen_pagetable_setup_start(pgd_t *base)
 {
@@ -1078,7 +1066,6 @@ void xen_setup_vcpu_info_placement(void)
 
        /* xen_vcpu_setup managed to place the vcpu_info within the
           percpu area for all cpus, so make use of it */
-#ifdef CONFIG_X86_32
        if (have_vcpu_info_placement) {
                printk(KERN_INFO "Xen: using vcpu_info placement\n");
 
@@ -1088,7 +1075,6 @@ void xen_setup_vcpu_info_placement(void)
                pv_irq_ops.irq_enable = xen_irq_enable_direct;
                pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
        }
-#endif
 }
 
 static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -1109,12 +1095,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
        goto patch_site
 
        switch (type) {
-#ifdef CONFIG_X86_32
                SITE(pv_irq_ops, irq_enable);
                SITE(pv_irq_ops, irq_disable);
                SITE(pv_irq_ops, save_fl);
                SITE(pv_irq_ops, restore_fl);
-#endif /* CONFIG_X86_32 */
 #undef SITE
 
        patch_site:
@@ -1252,6 +1236,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        .load_gs_index = xen_load_gs_index,
 #endif
 
+       .alloc_ldt = xen_alloc_ldt,
+       .free_ldt = xen_free_ldt,
+
        .store_gdt = native_store_gdt,
        .store_idt = native_store_idt,
        .store_tr = xen_store_tr,
@@ -1273,36 +1260,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        },
 };
 
-static void __init __xen_init_IRQ(void)
-{
-#ifdef CONFIG_X86_64
-       int i;
-
-       /* Create identity vector->irq map */
-       for(i = 0; i < NR_VECTORS; i++) {
-               int cpu;
-
-               for_each_possible_cpu(cpu)
-                       per_cpu(vector_irq, cpu)[i] = i;
-       }
-#endif /* CONFIG_X86_64 */
-
-       xen_init_IRQ();
-}
-
-static const struct pv_irq_ops xen_irq_ops __initdata = {
-       .init_IRQ = __xen_init_IRQ,
-       .save_fl = xen_save_fl,
-       .restore_fl = xen_restore_fl,
-       .irq_disable = xen_irq_disable,
-       .irq_enable = xen_irq_enable,
-       .safe_halt = xen_safe_halt,
-       .halt = xen_halt,
-#ifdef CONFIG_X86_64
-       .adjust_exception_frame = xen_adjust_exception_frame,
-#endif
-};
-
 static const struct pv_apic_ops xen_apic_ops __initdata = {
 #ifdef CONFIG_X86_LOCAL_APIC
        .setup_boot_clock = paravirt_nop,
@@ -1443,7 +1400,7 @@ static void __init xen_reserve_top(void)
        if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
                top = pp.virt_start;
 
-       reserve_top_address(-top + 2 * PAGE_SIZE);
+       reserve_top_address(-top);
 #endif /* CONFIG_X86_32 */
 }
 
@@ -1477,48 +1434,11 @@ static void *m2v(phys_addr_t maddr)
        return __ka(m2p(maddr));
 }
 
-#ifdef CONFIG_X86_64
-static void walk(pgd_t *pgd, unsigned long addr)
-{
-       unsigned l4idx = pgd_index(addr);
-       unsigned l3idx = pud_index(addr);
-       unsigned l2idx = pmd_index(addr);
-       unsigned l1idx = pte_index(addr);
-       pgd_t l4;
-       pud_t l3;
-       pmd_t l2;
-       pte_t l1;
-
-       xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
-                      pgd, addr, l4idx, l3idx, l2idx, l1idx);
-
-       l4 = pgd[l4idx];
-       xen_raw_printk("  l4: %016lx\n", l4.pgd);
-       xen_raw_printk("      %016lx\n", pgd_val(l4));
-
-       l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
-       xen_raw_printk("  l3: %016lx\n", l3.pud);
-       xen_raw_printk("      %016lx\n", pud_val(l3));
-
-       l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
-       xen_raw_printk("  l2: %016lx\n", l2.pmd);
-       xen_raw_printk("      %016lx\n", pmd_val(l2));
-
-       l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
-       xen_raw_printk("  l1: %016lx\n", l1.pte);
-       xen_raw_printk("      %016lx\n", pte_val(l1));
-}
-#endif
-
 static void set_page_prot(void *addr, pgprot_t prot)
 {
        unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
        pte_t pte = pfn_pte(pfn, prot);
 
-       xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
-                      addr, pfn, get_phys_to_machine(pfn),
-                      pgprot_val(prot), pte.pte);
-
        if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
                BUG();
 }
@@ -1694,6 +1614,8 @@ asmlinkage void __init xen_start_kernel(void)
        if (!xen_start_info)
                return;
 
+       xen_domain_type = XEN_PV_DOMAIN;
+
        BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
 
        xen_setup_features();
@@ -1703,10 +1625,11 @@ asmlinkage void __init xen_start_kernel(void)
        pv_init_ops = xen_init_ops;
        pv_time_ops = xen_time_ops;
        pv_cpu_ops = xen_cpu_ops;
-       pv_irq_ops = xen_irq_ops;
        pv_apic_ops = xen_apic_ops;
        pv_mmu_ops = xen_mmu_ops;
 
+       xen_init_irq_ops();
+
 #ifdef CONFIG_X86_LOCAL_APIC
        /*
         * set up the basic apic ops.
@@ -1737,7 +1660,7 @@ asmlinkage void __init xen_start_kernel(void)
 
        /* Prevent unwanted bits from being set in PTEs. */
        __supported_pte_mask &= ~_PAGE_GLOBAL;
-       if (!is_initial_xendomain())
+       if (!xen_initial_domain())
                __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
        /* Don't do the full vcpu_info placement stuff until we have a
@@ -1772,7 +1695,7 @@ asmlinkage void __init xen_start_kernel(void)
        boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
        boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
 
-       if (!is_initial_xendomain()) {
+       if (!xen_initial_domain()) {
                add_preferred_console("xenboot", 0, NULL);
                add_preferred_console("tty", 0, NULL);
                add_preferred_console("hvc", 0, NULL);
@@ -1780,15 +1703,6 @@ asmlinkage void __init xen_start_kernel(void)
 
        xen_raw_console_write("about to get started...\n");
 
-#if 0
-       xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
-                      &boot_params, __pa_symbol(&boot_params),
-                      __va(__pa_symbol(&boot_params)));
-
-       walk(pgd, &boot_params);
-       walk(pgd, __va(__pa(&boot_params)));
-#endif
-
        /* Start the world */
 #ifdef CONFIG_X86_32
        i386_start_kernel();
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644 (file)
index 0000000..28b85ab
--- /dev/null
@@ -0,0 +1,143 @@
+#include <linux/hardirq.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/vcpu.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xen-ops.h"
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ *
+ * The xen_version hypercall serves only as the trip into the
+ * hypervisor; its result is deliberately discarded.
+ */
+void xen_force_evtchn_callback(void)
+{
+       (void)HYPERVISOR_xen_version(0, NULL);
+}
+
+/*
+ * pv_irq_ops.init_IRQ hook.  On 64-bit, first fill every possible
+ * CPU's vector_irq table with the identity mapping (vector N -> irq N),
+ * then hand over to xen_init_IRQ().
+ */
+static void __init __xen_init_IRQ(void)
+{
+#ifdef CONFIG_X86_64
+       int cpu;
+
+       /* Create identity vector->irq map */
+       for_each_possible_cpu(cpu) {
+               int vector;
+
+               for (vector = 0; vector < NR_VECTORS; vector++)
+                       per_cpu(vector_irq, cpu)[vector] = vector;
+       }
+#endif /* CONFIG_X86_64 */
+
+       xen_init_IRQ();
+}
+
+/*
+ * pv_irq_ops.save_fl: report the current vcpu's event mask as an
+ * EFLAGS-style value.  Returns X86_EFLAGS_IF when events are unmasked
+ * and 0 when they are masked.
+ */
+static unsigned long xen_save_fl(void)
+{
+       const struct vcpu_info *vcpu = x86_read_percpu(xen_vcpu);
+
+       /* the mask has the opposite sense of the IF flag */
+       unsigned long enabled = (vcpu->evtchn_upcall_mask == 0);
+
+       /*
+        * Negating turns 0/1 into all-zeroes/all-ones, so the AND below
+        * yields either 0 or exactly the IF bit without a branch.
+        */
+       return -enabled & X86_EFLAGS_IF;
+}
+
+/*
+ * pv_irq_ops.restore_fl: set the current vcpu's event mask from the IF
+ * bit of an EFLAGS-style value.  If that unmasks events and one is
+ * already pending, force the event-channel callback.
+ */
+static void xen_restore_fl(unsigned long flags)
+{
+       struct vcpu_info *vcpu;
+
+       /* convert from IF type flag */
+       flags = !(flags & X86_EFLAGS_IF);
+
+       /* There's a one instruction preempt window here.  We need to
+          make sure we don't switch CPUs between getting the vcpu
+          pointer and updating the mask. */
+       preempt_disable();
+       vcpu = x86_read_percpu(xen_vcpu);
+       vcpu->evtchn_upcall_mask = flags;
+       preempt_enable_no_resched();
+
+       /* Doesn't matter if we get preempted here, because any
+          pending event will get dealt with anyway. */
+
+       if (flags == 0) {
+               preempt_check_resched();
+               barrier(); /* unmask then check (avoid races) */
+               if (unlikely(vcpu->evtchn_upcall_pending))
+                       xen_force_evtchn_callback();
+       }
+}
+
+/*
+ * pv_irq_ops.irq_disable: mask event delivery by setting the current
+ * vcpu's evtchn_upcall_mask.
+ */
+static void xen_irq_disable(void)
+{
+       /* There's a one instruction preempt window here.  We need to
+          make sure we don't switch CPUs between getting the vcpu
+          pointer and updating the mask. */
+       preempt_disable();
+       x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+       preempt_enable_no_resched();
+}
+
+/*
+ * pv_irq_ops.irq_enable: unmask event delivery by clearing the current
+ * vcpu's evtchn_upcall_mask, then force the event-channel callback if
+ * an event is already pending.
+ */
+static void xen_irq_enable(void)
+{
+       struct vcpu_info *vcpu;
+
+       /* We don't need to worry about being preempted here, since
+          either a) interrupts are disabled, so no preemption, or b)
+          the caller is confused and is trying to re-enable interrupts
+          on an indeterminate processor. */
+
+       vcpu = x86_read_percpu(xen_vcpu);
+       vcpu->evtchn_upcall_mask = 0;
+
+       /* Doesn't matter if we get preempted here, because any
+          pending event will get dealt with anyway. */
+
+       barrier(); /* unmask then check (avoid races) */
+       if (unlikely(vcpu->evtchn_upcall_pending))
+               xen_force_evtchn_callback();
+}
+
+/*
+ * pv_irq_ops.safe_halt: block this vcpu in the hypervisor.  A failing
+ * SCHEDOP_block hypercall is treated as a kernel bug.
+ */
+static void xen_safe_halt(void)
+{
+       int rc;
+
+       /* Blocking includes an implicit local_irq_enable(). */
+       rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
+       if (rc != 0)
+               BUG();
+}
+
+/*
+ * pv_irq_ops.halt: with interrupts enabled this is the same as
+ * xen_safe_halt(); with interrupts disabled the vcpu is taken down
+ * through VCPUOP_down instead.
+ */
+static void xen_halt(void)
+{
+       if (!irqs_disabled()) {
+               xen_safe_halt();
+               return;
+       }
+
+       HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+}
+
+/*
+ * Xen implementations of the pv_irq_ops hooks.  This table is a
+ * template: xen_init_irq_ops() copies it into pv_irq_ops.
+ */
+static const struct pv_irq_ops xen_irq_ops __initdata = {
+       .init_IRQ = __xen_init_IRQ,
+       .save_fl = xen_save_fl,
+       .restore_fl = xen_restore_fl,
+       .irq_disable = xen_irq_disable,
+       .irq_enable = xen_irq_enable,
+       .safe_halt = xen_safe_halt,
+       .halt = xen_halt,
+#ifdef CONFIG_X86_64
+       /* only hooked up on 64-bit */
+       .adjust_exception_frame = xen_adjust_exception_frame,
+#endif
+};
+
+/*
+ * Install the Xen interrupt-flag and halt hooks by copying xen_irq_ops
+ * into pv_irq_ops.
+ */
+void __init xen_init_irq_ops(void)
+{
+       pv_irq_ops = xen_irq_ops;
+}
index aa37469..ae173f6 100644 (file)
@@ -40,6 +40,7 @@
  */
 #include <linux/sched.h>
 #include <linux/highmem.h>
+#include <linux/debugfs.h>
 #include <linux/bug.h>
 
 #include <asm/pgtable.h>
 
 #include "multicalls.h"
 #include "mmu.h"
+#include "debugfs.h"
+
+#define MMU_UPDATE_HISTO       30
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+/*
+ * Event counters for the Xen MMU code, only built when
+ * CONFIG_XEN_DEBUG_FS is set.  They are updated through ADD_STATS().
+ * NOTE(review): presumably exported through debugfs elsewhere in this
+ * file -- not visible from here.
+ */
+static struct {
+       u32 pgd_update;
+       u32 pgd_update_pinned;
+       u32 pgd_update_batched;
+
+       u32 pud_update;
+       u32 pud_update_pinned;
+       u32 pud_update_batched;
+
+       u32 pmd_update;
+       u32 pmd_update_pinned;
+       u32 pmd_update_batched;
+
+       u32 pte_update;
+       u32 pte_update_pinned;
+       u32 pte_update_batched;
+
+       u32 mmu_update;
+       u32 mmu_update_extended;
+       /* one slot per update count, MMU_UPDATE_HISTO slots in total */
+       u32 mmu_update_histo[MMU_UPDATE_HISTO];
+
+       u32 prot_commit;
+       u32 prot_commit_batched;
+
+       u32 set_pte_at;
+       u32 set_pte_at_batched;
+       u32 set_pte_at_pinned;
+       u32 set_pte_at_current;
+       u32 set_pte_at_kernel;
+} mmu_stats;
+
+/*
+ * Request flag: when non-zero, the next check_zero() call clears all of
+ * mmu_stats.
+ * NOTE(review): presumably set from outside this code (e.g. a debugfs
+ * knob); the writer is not visible here.
+ */
+static u8 zero_stats;
+
+/*
+ * If a reset has been requested through zero_stats, wipe mmu_stats and
+ * clear the request.  ADD_STATS() calls this before every update.
+ */
+static inline void check_zero(void)
+{
+       if (unlikely(zero_stats)) {
+               memset(&mmu_stats, 0, sizeof(mmu_stats));
+               zero_stats = 0;
+       }
+}
+
+#define ADD_STATS(elem, val)                   \
+       do { check_zero(); mmu_stats.elem += (val); } while(0)
+
+#else  /* !CONFIG_XEN_DEBUG_FS */
+
+#define ADD_STATS(elem, val)   do { (void)(val); } while(0)
+
+#endif /* CONFIG_XEN_DEBUG_FS */
 
 /*
  * Just beyond the highest usermode address.  STACK_TOP_MAX has a
@@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr)
 }
 
 
-static bool page_pinned(void *ptr)
+static bool xen_page_pinned(void *ptr)
 {
        struct page *page = virt_to_page(ptr);
 
        return PagePinned(page);
 }
 
-static void extend_mmu_update(const struct mmu_update *update)
+static void xen_extend_mmu_update(const struct mmu_update *update)
 {
        struct multicall_space mcs;
        struct mmu_update *u;
 
        mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 
-       if (mcs.mc != NULL)
+       if (mcs.mc != NULL) {
+               ADD_STATS(mmu_update_extended, 1);
+               ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
+
                mcs.mc->args[1]++;
-       else {
+
+               if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
+                       ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
+               else
+                       ADD_STATS(mmu_update_histo[0], 1);
+       } else {
+               ADD_STATS(mmu_update, 1);
                mcs = __xen_mc_entry(sizeof(*u));
                MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+               ADD_STATS(mmu_update_histo[1], 1);
        }
 
        u = mcs.args;
@@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
        /* ptr may be ioremapped for 64-bit pagetable setup */
        u.ptr = arbitrary_virt_to_machine(ptr).maddr;
        u.val = pmd_val_ma(val);
-       extend_mmu_update(&u);
+       xen_extend_mmu_update(&u);
+
+       ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 
        xen_mc_issue(PARAVIRT_LAZY_MMU);
 
@@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 
 void xen_set_pmd(pmd_t *ptr, pmd_t val)
 {
+       ADD_STATS(pmd_update, 1);
+
        /* If page is not pinned, we can just update the entry
           directly */
-       if (!page_pinned(ptr)) {
+       if (!xen_page_pinned(ptr)) {
                *ptr = val;
                return;
        }
 
+       ADD_STATS(pmd_update_pinned, 1);
+
        xen_set_pmd_hyper(ptr, val);
 }
 
@@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
        if (mm == &init_mm)
                preempt_disable();
 
+       ADD_STATS(set_pte_at, 1);
+//     ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
+       ADD_STATS(set_pte_at_current, mm == current->mm);
+       ADD_STATS(set_pte_at_kernel, mm == &init_mm);
+
        if (mm == current->mm || mm == &init_mm) {
                if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
                        struct multicall_space mcs;
                        mcs = xen_mc_entry(0);
 
                        MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
+                       ADD_STATS(set_pte_at_batched, 1);
                        xen_mc_issue(PARAVIRT_LAZY_MMU);
                        goto out;
                } else
@@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 
        u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
        u.val = pte_val_ma(pte);
-       extend_mmu_update(&u);
+       xen_extend_mmu_update(&u);
+
+       ADD_STATS(prot_commit, 1);
+       ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 
        xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
@@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
        /* ptr may be ioremapped for 64-bit pagetable setup */
        u.ptr = arbitrary_virt_to_machine(ptr).maddr;
        u.val = pud_val_ma(val);
-       extend_mmu_update(&u);
+       xen_extend_mmu_update(&u);
+
+       ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 
        xen_mc_issue(PARAVIRT_LAZY_MMU);
 
@@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 
 void xen_set_pud(pud_t *ptr, pud_t val)
 {
+       ADD_STATS(pud_update, 1);
+
        /* If page is not pinned, we can just update the entry
           directly */
-       if (!page_pinned(ptr)) {
+       if (!xen_page_pinned(ptr)) {
                *ptr = val;
                return;
        }
 
+       ADD_STATS(pud_update_pinned, 1);
+
        xen_set_pud_hyper(ptr, val);
 }
 
 void xen_set_pte(pte_t *ptep, pte_t pte)
 {
+       ADD_STATS(pte_update, 1);
+//     ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
+       ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
+
 #ifdef CONFIG_X86_PAE
        ptep->pte_high = pte.pte_high;
        smp_wmb();
@@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 
        u.ptr = virt_to_machine(ptr).maddr;
        u.val = pgd_val_ma(val);
-       extend_mmu_update(&u);
+       xen_extend_mmu_update(&u);
 }
 
 /*
@@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
 {
        pgd_t *user_ptr = xen_get_user_pgd(ptr);
 
+       ADD_STATS(pgd_update, 1);
+
        /* If page is not pinned, we can just update the entry
           directly */
-       if (!page_pinned(ptr)) {
+       if (!xen_page_pinned(ptr)) {
                *ptr = val;
                if (user_ptr) {
-                       WARN_ON(page_pinned(user_ptr));
+                       WARN_ON(xen_page_pinned(user_ptr));
                        *user_ptr = val;
                }
                return;
        }
 
+       ADD_STATS(pgd_update_pinned, 1);
+       ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
+
        /* If it's pinned, then we can at least batch the kernel and
           user updates together. */
        xen_mc_batch();
@@ -555,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
  * For 64-bit, we must skip the Xen hole in the middle of the address
  * space, just after the big x86-64 virtual hole.
  */
-static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
-                   unsigned long limit)
+static int xen_pgd_walk(struct mm_struct *mm,
+                       int (*func)(struct mm_struct *mm, struct page *,
+                                   enum pt_level),
+                       unsigned long limit)
 {
+       pgd_t *pgd = mm->pgd;
        int flush = 0;
        unsigned hole_low, hole_high;
        unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
@@ -590,8 +689,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
        pmdidx_limit = 0;
 #endif
 
-       flush |= (*func)(virt_to_page(pgd), PT_PGD);
-
        for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
                pud_t *pud;
 
@@ -604,7 +701,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
                pud = pud_offset(&pgd[pgdidx], 0);
 
                if (PTRS_PER_PUD > 1) /* not folded */
-                       flush |= (*func)(virt_to_page(pud), PT_PUD);
+                       flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
 
                for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
                        pmd_t *pmd;
@@ -619,7 +716,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
                        pmd = pmd_offset(&pud[pudidx], 0);
 
                        if (PTRS_PER_PMD > 1) /* not folded */
-                               flush |= (*func)(virt_to_page(pmd), PT_PMD);
+                               flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
 
                        for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
                                struct page *pte;
@@ -633,28 +730,34 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
                                        continue;
 
                                pte = pmd_page(pmd[pmdidx]);
-                               flush |= (*func)(pte, PT_PTE);
+                               flush |= (*func)(mm, pte, PT_PTE);
                        }
                }
        }
+
 out:
+       /* Do the top level last, so that the callbacks can use it as
+          a cue to do final things like tlb flushes. */
+       flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
 
        return flush;
 }
 
-static spinlock_t *lock_pte(struct page *page)
+/* If we're using split pte locks, then take the page's lock and
+   return a pointer to it.  Otherwise return NULL. */
+static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 {
        spinlock_t *ptl = NULL;
 
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+#if USE_SPLIT_PTLOCKS
        ptl = __pte_lockptr(page);
-       spin_lock(ptl);
+       spin_lock_nest_lock(ptl, &mm->page_table_lock);
 #endif
 
        return ptl;
 }
 
-static void do_unlock(void *v)
+static void xen_pte_unlock(void *v)
 {
        spinlock_t *ptl = v;
        spin_unlock(ptl);
@@ -672,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 }
 
-static int pin_page(struct page *page, enum pt_level level)
+static int xen_pin_page(struct mm_struct *mm, struct page *page,
+                       enum pt_level level)
 {
        unsigned pgfl = TestSetPagePinned(page);
        int flush;
@@ -691,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level)
 
                flush = 0;
 
+               /*
+                * We need to hold the pagetable lock between the time
+                * we make the pagetable RO and when we actually pin
+                * it.  If we don't, then other users may come in and
+                * attempt to update the pagetable by writing it,
+                * which will fail because the memory is RO but not
+                * pinned, so Xen won't do the trap'n'emulate.
+                *
+                * If we're using split pte locks, we can't hold the
+                * entire pagetable's worth of locks during the
+                * traverse, because we may wrap the preempt count (8
+                * bits).  The solution is to mark RO and pin each PTE
+                * page while holding the lock.  This means the number
+                * of locks we end up holding is never more than a
+                * batch size (~32 entries, at present).
+                *
+                * If we're not using split pte locks, we needn't pin
+                * the PTE pages independently, because we're
+                * protected by the overall pagetable lock.
+                */
                ptl = NULL;
                if (level == PT_PTE)
-                       ptl = lock_pte(page);
+                       ptl = xen_pte_lock(page, mm);
 
                MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
                                        pfn_pte(pfn, PAGE_KERNEL_RO),
                                        level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 
-               if (level == PT_PTE)
+               if (ptl) {
                        xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
 
-               if (ptl) {
                        /* Queue a deferred unlock for when this batch
                           is completed. */
-                       xen_mc_callback(do_unlock, ptl);
+                       xen_mc_callback(xen_pte_unlock, ptl);
                }
        }
 
@@ -715,11 +838,11 @@ static int pin_page(struct page *page, enum pt_level level)
 /* This is called just after a mm has been created, but it has not
    been used yet.  We need to make sure that its pagetable is all
    read-only, and can be pinned. */
-void xen_pgd_pin(pgd_t *pgd)
+static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 {
        xen_mc_batch();
 
-       if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
+       if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
                /* re-enable interrupts for kmap_flush_unused */
                xen_mc_issue(0);
                kmap_flush_unused();
@@ -733,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd)
                xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
 
                if (user_pgd) {
-                       pin_page(virt_to_page(user_pgd), PT_PGD);
+                       xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
                        xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
                }
        }
 #else /* CONFIG_X86_32 */
 #ifdef CONFIG_X86_PAE
        /* Need to make sure unshared kernel PMD is pinnable */
-       pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+       xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
+                    PT_PMD);
 #endif
        xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
 #endif /* CONFIG_X86_64 */
        xen_mc_issue(0);
 }
 
+/* Pin the pagetable of @mm, starting from its own top-level mm->pgd. */
+static void xen_pgd_pin(struct mm_struct *mm)
+{
+       __xen_pgd_pin(mm, mm->pgd);
+}
+
 /*
  * On save, we need to pin all pagetables to make sure they get their
  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
  * them (unpinned pgds are not currently in use, probably because the
  * process is under construction or destruction).
+ *
+ * Expected to be called in stop_machine() ("equivalent to taking
+ * every spinlock in the system"), so the locking doesn't really
+ * matter all that much.
  */
 void xen_mm_pin_all(void)
 {
@@ -762,7 +895,7 @@ void xen_mm_pin_all(void)
 
        list_for_each_entry(page, &pgd_list, lru) {
                if (!PagePinned(page)) {
-                       xen_pgd_pin((pgd_t *)page_address(page));
+                       __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
                        SetPageSavePinned(page);
                }
        }
@@ -775,7 +908,8 @@ void xen_mm_pin_all(void)
  * that's before we have page structures to store the bits.  So do all
  * the book-keeping now.
  */
-static __init int mark_pinned(struct page *page, enum pt_level level)
+static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
+                                 enum pt_level level)
 {
        SetPagePinned(page);
        return 0;
@@ -783,10 +917,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
 
 void __init xen_mark_init_mm_pinned(void)
 {
-       pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
+       xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
 }
 
-static int unpin_page(struct page *page, enum pt_level level)
+static int xen_unpin_page(struct mm_struct *mm, struct page *page,
+                         enum pt_level level)
 {
        unsigned pgfl = TestClearPagePinned(page);
 
@@ -796,10 +931,18 @@ static int unpin_page(struct page *page, enum pt_level level)
                spinlock_t *ptl = NULL;
                struct multicall_space mcs;
 
+               /*
+                * Do the converse to pin_page.  If we're using split
+                * pte locks, we must be holding the lock while
+                * the pte page is unpinned but still RO to prevent
+                * concurrent updates from seeing it in this
+                * partially-pinned state.
+                */
                if (level == PT_PTE) {
-                       ptl = lock_pte(page);
+                       ptl = xen_pte_lock(page, mm);
 
-                       xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+                       if (ptl)
+                               xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
                }
 
                mcs = __xen_mc_entry(0);
@@ -810,7 +953,7 @@ static int unpin_page(struct page *page, enum pt_level level)
 
                if (ptl) {
                        /* unlock when batch completed */
-                       xen_mc_callback(do_unlock, ptl);
+                       xen_mc_callback(xen_pte_unlock, ptl);
                }
        }
 
@@ -818,7 +961,7 @@ static int unpin_page(struct page *page, enum pt_level level)
 }
 
 /* Release a pagetables pages back as normal RW */
-static void xen_pgd_unpin(pgd_t *pgd)
+static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
 {
        xen_mc_batch();
 
@@ -830,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd)
 
                if (user_pgd) {
                        xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
-                       unpin_page(virt_to_page(user_pgd), PT_PGD);
+                       xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
                }
        }
 #endif
 
 #ifdef CONFIG_X86_PAE
        /* Need to make sure unshared kernel PMD is unpinned */
-       pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+       xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
+                      PT_PMD);
 #endif
 
-       pgd_walk(pgd, unpin_page, USER_LIMIT);
+       xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
 
        xen_mc_issue(0);
 }
 
+/* Unpin the pagetable of @mm, starting from its own top-level mm->pgd. */
+static void xen_pgd_unpin(struct mm_struct *mm)
+{
+       __xen_pgd_unpin(mm, mm->pgd);
+}
+
 /*
  * On resume, undo any pinning done at save, so that the rest of the
  * kernel doesn't see any unexpected pinned pagetables.
@@ -859,7 +1008,7 @@ void xen_mm_unpin_all(void)
        list_for_each_entry(page, &pgd_list, lru) {
                if (PageSavePinned(page)) {
                        BUG_ON(!PagePinned(page));
-                       xen_pgd_unpin((pgd_t *)page_address(page));
+                       __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
                        ClearPageSavePinned(page);
                }
        }
@@ -870,14 +1019,14 @@ void xen_mm_unpin_all(void)
 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
        spin_lock(&next->page_table_lock);
-       xen_pgd_pin(next->pgd);
+       xen_pgd_pin(next);
        spin_unlock(&next->page_table_lock);
 }
 
 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
        spin_lock(&mm->page_table_lock);
-       xen_pgd_pin(mm->pgd);
+       xen_pgd_pin(mm);
        spin_unlock(&mm->page_table_lock);
 }
 
@@ -907,7 +1056,7 @@ static void drop_other_mm_ref(void *info)
        }
 }
 
-static void drop_mm_ref(struct mm_struct *mm)
+static void xen_drop_mm_ref(struct mm_struct *mm)
 {
        cpumask_t mask;
        unsigned cpu;
@@ -937,7 +1086,7 @@ static void drop_mm_ref(struct mm_struct *mm)
                smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
 }
 #else
-static void drop_mm_ref(struct mm_struct *mm)
+static void xen_drop_mm_ref(struct mm_struct *mm)
 {
        if (current->active_mm == mm)
                load_cr3(swapper_pg_dir);
@@ -961,14 +1110,77 @@ static void drop_mm_ref(struct mm_struct *mm)
 void xen_exit_mmap(struct mm_struct *mm)
 {
        get_cpu();              /* make sure we don't move around */
-       drop_mm_ref(mm);
+       xen_drop_mm_ref(mm);
        put_cpu();
 
        spin_lock(&mm->page_table_lock);
 
        /* pgd may not be pinned in the error exit path of execve */
-       if (page_pinned(mm->pgd))
-               xen_pgd_unpin(mm->pgd);
+       if (xen_page_pinned(mm->pgd))
+               xen_pgd_unpin(mm);
 
        spin_unlock(&mm->page_table_lock);
 }
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+static struct dentry *d_mmu_debug;
+
+/*
+ * Create the "mmu" directory under the Xen debugfs root and export the
+ * mmu_stats counters in it, together with a writable "zero_stats" file
+ * that requests a reset of all counters.
+ *
+ * Each file is bound to the counter of the same name; the whole
+ * mmu_update_histo array (MMU_UPDATE_HISTO entries) is exported.
+ *
+ * Returns 0 on success, or -ENOMEM when xen_init_debugfs() yields no
+ * root directory.
+ */
+static int __init xen_mmu_debugfs(void)
+{
+       struct dentry *d_xen = xen_init_debugfs();
+
+       if (d_xen == NULL)
+               return -ENOMEM;
+
+       d_mmu_debug = debugfs_create_dir("mmu", d_xen);
+
+       debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
+
+       debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
+       debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
+                          &mmu_stats.pgd_update_pinned);
+       debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
+                          &mmu_stats.pgd_update_batched);
+
+       debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
+       debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
+                          &mmu_stats.pud_update_pinned);
+       debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
+                          &mmu_stats.pud_update_batched);
+
+       debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
+       debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
+                          &mmu_stats.pmd_update_pinned);
+       debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
+                          &mmu_stats.pmd_update_batched);
+
+       /* pte_update_pinned is not exported: its accounting in
+          xen_set_pte() is currently disabled. */
+       debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
+       debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
+                          &mmu_stats.pte_update_batched);
+
+       debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
+       debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
+                          &mmu_stats.mmu_update_extended);
+       xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
+                                    mmu_stats.mmu_update_histo,
+                                    MMU_UPDATE_HISTO);
+
+       debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
+       debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
+                          &mmu_stats.set_pte_at_batched);
+       debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
+                          &mmu_stats.set_pte_at_current);
+       debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
+                          &mmu_stats.set_pte_at_kernel);
+
+       debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
+       debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
+                          &mmu_stats.prot_commit_batched);
+
+       return 0;
+}
+fs_initcall(xen_mmu_debugfs);
+
+#endif /* CONFIG_XEN_DEBUG_FS */
index 0f59bd0..98d7165 100644 (file)
@@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
 void xen_exit_mmap(struct mm_struct *mm);
 
-void xen_pgd_pin(pgd_t *pgd);
-//void xen_pgd_unpin(pgd_t *pgd);
-
 pteval_t xen_pte_val(pte_t);
 pmdval_t xen_pmd_val(pmd_t);
 pgdval_t xen_pgd_val(pgd_t);
index 9efd1c6..8ea8a0d 100644 (file)
  */
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/debugfs.h>
 
 #include <asm/xen/hypercall.h>
 
 #include "multicalls.h"
+#include "debugfs.h"
+
+#define MC_BATCH       32
 
 #define MC_DEBUG       1
 
-#define MC_BATCH       32
 #define MC_ARGS                (MC_BATCH * 16)
 
+
 struct mc_buffer {
        struct multicall_entry entries[MC_BATCH];
 #if MC_DEBUG
@@ -47,6 +51,76 @@ struct mc_buffer {
 static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
 DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
 
+/*
+ * Why a multicall batch was flushed early; used as the index into the
+ * per-reason flush counters.
+ */
+enum flush_reasons
+{
+       FL_SLOTS,               /* 0: all MC_BATCH multicall slots in use */
+       FL_ARGS,                /* 1: argument space would overflow MC_ARGS */
+       FL_CALLBACKS,           /* 2: all MC_BATCH callback slots in use */
+
+       FL_N_REASONS            /* number of reasons; sizes the counter array */
+};
+
+#ifdef CONFIG_XEN_DEBUG_FS
+#define NHYPERCALLS    40              /* not really */
+
+/* Multicall batching statistics, updated by mc_add_stats() and
+   mc_stats_flush(). */
+static struct {
+       /* batches by size: indexed by mcidx at flush time, 0..MC_BATCH */
+       unsigned histo[MC_BATCH+1];
+
+       unsigned issued;        /* number of flushes accounted */
+       unsigned arg_total;     /* sum of argidx over all flushes */
+       unsigned hypercalls;    /* sum of mcidx over all flushes */
+       /* per-op counts; ops >= NHYPERCALLS are not recorded */
+       unsigned histo_hypercalls[NHYPERCALLS];
+
+       unsigned flush[FL_N_REASONS];   /* early flushes, by enum flush_reasons */
+} mc_stats;
+
+static u8 zero_stats;
+
+/*
+ * Honour a pending reset request: when zero_stats has been set (it is
+ * exposed as a writable debugfs file), wipe every mc_stats counter and
+ * clear the request flag.
+ */
+static inline void check_zero(void)
+{
+       if (likely(!zero_stats))
+               return;
+
+       memset(&mc_stats, 0, sizeof(mc_stats));
+       zero_stats = 0;
+}
+
+/*
+ * Fold the state of multicall buffer @mc into mc_stats: one more batch
+ * issued, its entry and argument totals, the batch-size histogram, and
+ * a per-op count for every queued entry.  Ops at or beyond NHYPERCALLS
+ * are left out of the per-op histogram.
+ */
+static void mc_add_stats(const struct mc_buffer *mc)
+{
+       int idx;
+
+       check_zero();
+
+       mc_stats.issued += 1;
+       mc_stats.hypercalls += mc->mcidx;
+       mc_stats.arg_total += mc->argidx;
+       mc_stats.histo[mc->mcidx] += 1;
+
+       for (idx = 0; idx < mc->mcidx; idx++) {
+               unsigned nr = mc->entries[idx].op;
+
+               if (nr >= NHYPERCALLS)
+                       continue;
+
+               mc_stats.histo_hypercalls[nr] += 1;
+       }
+}
+
+/* Count one early flush for reason @idx, after applying any pending
+   zero_stats reset. */
+static void mc_stats_flush(enum flush_reasons idx)
+{
+       check_zero();
+
+       mc_stats.flush[idx]++;
+}
+
+#else  /* !CONFIG_XEN_DEBUG_FS */
+
+/* Stats disabled (!CONFIG_XEN_DEBUG_FS): nothing to record. */
+static inline void mc_add_stats(const struct mc_buffer *mc)
+{
+}
+
+/* Stats disabled (!CONFIG_XEN_DEBUG_FS): nothing to record. */
+static inline void mc_stats_flush(enum flush_reasons idx)
+{
+}
+#endif /* CONFIG_XEN_DEBUG_FS */
+
 void xen_mc_flush(void)
 {
        struct mc_buffer *b = &__get_cpu_var(mc_buffer);
@@ -60,6 +134,8 @@ void xen_mc_flush(void)
           something in the middle */
        local_irq_save(flags);
 
+       mc_add_stats(b);
+
        if (b->mcidx) {
 #if MC_DEBUG
                memcpy(b->debug, b->entries,
@@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args)
 
        if (b->mcidx == MC_BATCH ||
            (argidx + args) > MC_ARGS) {
+               mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
                xen_mc_flush();
                argidx = roundup(b->argidx, sizeof(u64));
        }
@@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data)
        struct mc_buffer *b = &__get_cpu_var(mc_buffer);
        struct callback *cb;
 
-       if (b->cbidx == MC_BATCH)
+       if (b->cbidx == MC_BATCH) {
+               mc_stats_flush(FL_CALLBACKS);
                xen_mc_flush();
+       }
 
        cb = &b->callbacks[b->cbidx++];
        cb->fn = fn;
        cb->data = data;
 }
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+static struct dentry *d_mc_debug;
+
+/*
+ * Create the "multicalls" directory under the Xen debugfs root and
+ * export the mc_stats counters in it, together with a writable
+ * "zero_stats" file that requests a reset of all counters.
+ *
+ * Returns 0 on success, or -ENOMEM when xen_init_debugfs() yields no
+ * root directory.
+ */
+static int __init xen_mc_debugfs(void)
+{
+       struct dentry *d_xen = xen_init_debugfs();
+
+       if (d_xen == NULL)
+               return -ENOMEM;
+
+       d_mc_debug = debugfs_create_dir("multicalls", d_xen);
+
+       debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
+
+       debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
+       debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
+       debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
+
+       /* histo has MC_BATCH+1 slots (batch sizes 0..MC_BATCH); export
+          all of them so completely full batches are visible too. */
+       xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
+                                    mc_stats.histo, MC_BATCH+1);
+       xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
+                                    mc_stats.histo_hypercalls, NHYPERCALLS);
+       xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
+                                    mc_stats.flush, FL_N_REASONS);
+
+       return 0;
+}
+fs_initcall(xen_mc_debugfs);
+
+#endif /* CONFIG_XEN_DEBUG_FS */
index d8faf79..d77da61 100644 (file)
  * useful topology information for the kernel to make use of.  As a
  * result, all CPUs are treated as if they're single-core and
  * single-threaded.
- *
- * This does not handle HOTPLUG_CPU yet.
  */
 #include <linux/sched.h>
-#include <linux/kernel_stat.h>
 #include <linux/err.h>
 #include <linux/smp.h>
 
@@ -36,8 +33,6 @@
 #include "xen-ops.h"
 #include "mmu.h"
 
-static void __cpuinit xen_init_lock_cpu(int cpu);
-
 cpumask_t xen_cpu_initialized_map;
 
 static DEFINE_PER_CPU(int, resched_irq);
@@ -64,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
-static __cpuinit void cpu_bringup_and_idle(void)
+static __cpuinit void cpu_bringup(void)
 {
        int cpu = smp_processor_id();
 
        cpu_init();
+       touch_softlockup_watchdog();
        preempt_disable();
 
        xen_enable_sysenter();
@@ -89,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void)
        local_irq_enable();
 
        wmb();                  /* make sure everything is out */
+}
+
+/* Run the per-cpu bringup sequence on this cpu, then enter the idle loop. */
+static __cpuinit void cpu_bringup_and_idle(void)
+{
+       cpu_bringup();
+       cpu_idle();
+}
 
@@ -212,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 
                cpu_set(cpu, cpu_present_map);
        }
-
-       //init_xenbus_allowed_cpumask();
 }
 
 static __cpuinit int
@@ -281,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
        struct task_struct *idle = idle_task(cpu);
        int rc;
 
-#if 0
-       rc = cpu_up_check(cpu);
-       if (rc)
-               return rc;
-#endif
-
 #ifdef CONFIG_X86_64
        /* Allocate node local memory for AP pdas */
        WARN_ON(cpu == 0);
@@ -339,6 +332,60 @@ static void xen_smp_cpus_done(unsigned int max_cpus)
 {
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Take the calling cpu offline.  Cpu 0 is refused with -EBUSY; for any
+ * other cpu the common disable path runs and the cpu is switched onto
+ * swapper_pg_dir.  Returns 0 on success.
+ */
+static int xen_cpu_disable(void)
+{
+       unsigned int cpu = smp_processor_id();
+       if (cpu == 0)
+               return -EBUSY;
+
+       cpu_disable_common();
+
+       load_cr3(swapper_pg_dir);
+       return 0;
+}
+
+/*
+ * Finish taking @cpu down.  Polls the hypervisor until VCPUOP_is_up
+ * reports zero for the vcpu, sleeping HZ/10 between polls, then
+ * releases its per-cpu resources: the resched, callfunc, debug and
+ * callfuncsingle irq bindings, the spinlock state and the timer.  When
+ * only one cpu is left online, alternatives_smp_switch(0) is called.
+ */
+static void xen_cpu_die(unsigned int cpu)
+{
+       /* sleep via the helper rather than writing current->state by hand */
+       while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL))
+               schedule_timeout_uninterruptible(HZ/10);
+
+       unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
+       unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+       unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
+       unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
+       xen_uninit_lock_cpu(cpu);
+       xen_teardown_timer(cpu);
+
+       if (num_online_cpus() == 1)
+               alternatives_smp_switch(0);
+}
+
+/*
+ * Park the calling cpu: run the common play-dead path, then ask the
+ * hypervisor to take this vcpu down.
+ */
+static void xen_play_dead(void)
+{
+       play_dead_common();
+       HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+       /* NOTE(review): presumably reached only once the vcpu is brought
+          back up, so the bringup sequence is re-run here -- confirm */
+       cpu_bringup();
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+/* !CONFIG_HOTPLUG_CPU: taking a cpu offline is not supported. */
+static int xen_cpu_disable(void)
+{
+       return -ENOSYS;
+}
+
+/* !CONFIG_HOTPLUG_CPU: any call here is treated as a bug. */
+static void xen_cpu_die(unsigned int cpu)
+{
+       BUG();
+}
+
+/* !CONFIG_HOTPLUG_CPU: any call here is treated as a bug. */
+static void xen_play_dead(void)
+{
+       BUG();
+}
+
+#endif
 static void stop_self(void *v)
 {
        int cpu = smp_processor_id();
@@ -419,176 +466,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
-struct xen_spinlock {
-       unsigned char lock;             /* 0 -> free; 1 -> locked */
-       unsigned short spinners;        /* count of waiting cpus */
-};
-
-static int xen_spin_is_locked(struct raw_spinlock *lock)
-{
-       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
-       return xl->lock != 0;
-}
-
-static int xen_spin_is_contended(struct raw_spinlock *lock)
-{
-       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
-       /* Not strictly true; this is only the count of contended
-          lock-takers entering the slow path. */
-       return xl->spinners != 0;
-}
-
-static int xen_spin_trylock(struct raw_spinlock *lock)
-{
-       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-       u8 old = 1;
-
-       asm("xchgb %b0,%1"
-           : "+q" (old), "+m" (xl->lock) : : "memory");
-
-       return old == 0;
-}
-
-static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
-
-static inline void spinning_lock(struct xen_spinlock *xl)
-{
-       __get_cpu_var(lock_spinners) = xl;
-       wmb();                  /* set lock of interest before count */
-       asm(LOCK_PREFIX " incw %0"
-           : "+m" (xl->spinners) : : "memory");
-}
-
-static inline void unspinning_lock(struct xen_spinlock *xl)
-{
-       asm(LOCK_PREFIX " decw %0"
-           : "+m" (xl->spinners) : : "memory");
-       wmb();                  /* decrement count before clearing lock */
-       __get_cpu_var(lock_spinners) = NULL;
-}
-
-static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
-{
-       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-       int irq = __get_cpu_var(lock_kicker_irq);
-       int ret;
-
-       /* If kicker interrupts not initialized yet, just spin */
-       if (irq == -1)
-               return 0;
-
-       /* announce we're spinning */
-       spinning_lock(xl);
-
-       /* clear pending */
-       xen_clear_irq_pending(irq);
-
-       /* check again make sure it didn't become free while
-          we weren't looking  */
-       ret = xen_spin_trylock(lock);
-       if (ret)
-               goto out;
-
-       /* block until irq becomes pending */
-       xen_poll_irq(irq);
-       kstat_this_cpu.irqs[irq]++;
-
-out:
-       unspinning_lock(xl);
-       return ret;
-}
-
-static void xen_spin_lock(struct raw_spinlock *lock)
-{
-       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-       int timeout;
-       u8 oldval;
-
-       do {
-               timeout = 1 << 10;
-
-               asm("1: xchgb %1,%0\n"
-                   "   testb %1,%1\n"
-                   "   jz 3f\n"
-                   "2: rep;nop\n"
-                   "   cmpb $0,%0\n"
-                   "   je 1b\n"
-                   "   dec %2\n"
-                   "   jnz 2b\n"
-                   "3:\n"
-                   : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
-                   : "1" (1)
-                   : "memory");
-
-       } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
-}
-
-static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
-{
-       int cpu;
-
-       for_each_online_cpu(cpu) {
-               /* XXX should mix up next cpu selection */
-               if (per_cpu(lock_spinners, cpu) == xl) {
-                       xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
-                       break;
-               }
-       }
-}
-
-static void xen_spin_unlock(struct raw_spinlock *lock)
-{
-       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
-
-       smp_wmb();              /* make sure no writes get moved after unlock */
-       xl->lock = 0;           /* release lock */
-
-       /* make sure unlock happens before kick */
-       barrier();
-
-       if (unlikely(xl->spinners))
-               xen_spin_unlock_slow(xl);
-}
-
-static __cpuinit void xen_init_lock_cpu(int cpu)
-{
-       int irq;
-       const char *name;
-
-       name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
-       irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
-                                    cpu,
-                                    xen_reschedule_interrupt,
-                                    IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
-                                    name,
-                                    NULL);
-
-       if (irq >= 0) {
-               disable_irq(irq); /* make sure it's never delivered */
-               per_cpu(lock_kicker_irq, cpu) = irq;
-       }
-
-       printk("cpu %d spinlock event irq %d\n", cpu, irq);
-}
-
-static void __init xen_init_spinlocks(void)
-{
-       pv_lock_ops.spin_is_locked = xen_spin_is_locked;
-       pv_lock_ops.spin_is_contended = xen_spin_is_contended;
-       pv_lock_ops.spin_lock = xen_spin_lock;
-       pv_lock_ops.spin_trylock = xen_spin_trylock;
-       pv_lock_ops.spin_unlock = xen_spin_unlock;
-}
-
 static const struct smp_ops xen_smp_ops __initdata = {
        .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
        .smp_prepare_cpus = xen_smp_prepare_cpus,
-       .cpu_up = xen_cpu_up,
        .smp_cpus_done = xen_smp_cpus_done,
 
+       .cpu_up = xen_cpu_up,
+       .cpu_die = xen_cpu_die,
+       .cpu_disable = xen_cpu_disable,
+       .play_dead = xen_play_dead,
+
        .smp_send_stop = xen_smp_send_stop,
        .smp_send_reschedule = xen_smp_send_reschedule,
 
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644 (file)
index 0000000..dd71e3a
--- /dev/null
@@ -0,0 +1,428 @@
+/*
+ * Split spinlock implementation out into its own file, so it can be
+ * compiled in a FTRACE-compatible way.
+ */
+#include <linux/kernel_stat.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/log2.h>
+
+#include <asm/paravirt.h>
+
+#include <xen/interface/xen.h>
+#include <xen/events.h>
+
+#include "xen-ops.h"
+#include "debugfs.h"
+
+#ifdef CONFIG_XEN_DEBUG_FS
+static struct xen_spinlock_stats
+{
+       u64 taken;
+       u32 taken_slow;
+       u32 taken_slow_nested;
+       u32 taken_slow_pickup;
+       u32 taken_slow_spurious;
+       u32 taken_slow_irqenable;
+
+       u64 released;
+       u32 released_slow;
+       u32 released_slow_kicked;
+
+#define HISTO_BUCKETS  30
+       u32 histo_spin_total[HISTO_BUCKETS+1];
+       u32 histo_spin_spinning[HISTO_BUCKETS+1];
+       u32 histo_spin_blocked[HISTO_BUCKETS+1];
+
+       u64 time_total;
+       u64 time_spinning;
+       u64 time_blocked;
+} spinlock_stats;
+
+static u8 zero_stats;
+
+static unsigned lock_timeout = 1 << 10;
+#define TIMEOUT lock_timeout
+
+static inline void check_zero(void)
+{
+       if (unlikely(zero_stats)) {
+               memset(&spinlock_stats, 0, sizeof(spinlock_stats));
+               zero_stats = 0;
+       }
+}
+
+#define ADD_STATS(elem, val)                   \
+       do { check_zero(); spinlock_stats.elem += (val); } while(0)
+
+static inline u64 spin_time_start(void)
+{
+       return xen_clocksource_read();
+}
+
+static void __spin_time_accum(u64 delta, u32 *array)
+{
+       unsigned index = ilog2(delta);
+
+       check_zero();
+
+       if (index < HISTO_BUCKETS)
+               array[index]++;
+       else
+               array[HISTO_BUCKETS]++;
+}
+
+static inline void spin_time_accum_spinning(u64 start)
+{
+       u32 delta = xen_clocksource_read() - start;
+
+       __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
+       spinlock_stats.time_spinning += delta;
+}
+
+static inline void spin_time_accum_total(u64 start)
+{
+       u32 delta = xen_clocksource_read() - start;
+
+       __spin_time_accum(delta, spinlock_stats.histo_spin_total);
+       spinlock_stats.time_total += delta;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+       u32 delta = xen_clocksource_read() - start;
+
+       __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
+       spinlock_stats.time_blocked += delta;
+}
+#else  /* !CONFIG_XEN_DEBUG_FS */
+#define TIMEOUT                        (1 << 10)
+#define ADD_STATS(elem, val)   do { (void)(val); } while(0)
+
+static inline u64 spin_time_start(void)
+{
+       return 0;
+}
+
+static inline void spin_time_accum_total(u64 start)
+{
+}
+static inline void spin_time_accum_spinning(u64 start)
+{
+}
+static inline void spin_time_accum_blocked(u64 start)
+{
+}
+#endif  /* CONFIG_XEN_DEBUG_FS */
+
+struct xen_spinlock {
+       unsigned char lock;             /* 0 -> free; 1 -> locked */
+       unsigned short spinners;        /* count of waiting cpus */
+};
+
+static int xen_spin_is_locked(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+       return xl->lock != 0;
+}
+
+static int xen_spin_is_contended(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+       /* Not strictly true; this is only the count of contended
+          lock-takers entering the slow path. */
+       return xl->spinners != 0;
+}
+
+static int xen_spin_trylock(struct raw_spinlock *lock)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+       u8 old = 1;
+
+       asm("xchgb %b0,%1"
+           : "+q" (old), "+m" (xl->lock) : : "memory");
+
+       return old == 0;
+}
+
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
+
+/*
+ * Mark a cpu as interested in a lock.  Returns the CPU's previous
+ * lock of interest, in case we got preempted by an interrupt.
+ */
+static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
+{
+       struct xen_spinlock *prev;
+
+       prev = __get_cpu_var(lock_spinners);
+       __get_cpu_var(lock_spinners) = xl;
+
+       wmb();                  /* set lock of interest before count */
+
+       asm(LOCK_PREFIX " incw %0"
+           : "+m" (xl->spinners) : : "memory");
+
+       return prev;
+}
+
+/*
+ * Mark a cpu as no longer interested in a lock.  Restores previous
+ * lock of interest (NULL for none).
+ */
+static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
+{
+       asm(LOCK_PREFIX " decw %0"
+           : "+m" (xl->spinners) : : "memory");
+       wmb();                  /* decrement count before restoring lock */
+       __get_cpu_var(lock_spinners) = prev;
+}
+
+static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
+{
+       struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+       struct xen_spinlock *prev;
+       int irq = __get_cpu_var(lock_kicker_irq);
+       int ret;
+       unsigned long flags;
+       u64 start;
+
+       /* If kicker interrupts not initialized yet, just spin */
+       if (irq == -1)
+               return 0;
+
+       start = spin_time_start();
+
+       /* announce we're spinning */
+       prev = spinning_lock(xl);
+
+       flags = __raw_local_save_flags();
+       if (irq_enable) {
+               ADD_STATS(taken_slow_irqenable, 1);
+               raw_local_irq_enable();
+       }
+
+       ADD_STATS(taken_slow, 1);
+       ADD_STATS(taken_slow_nested, prev != NULL);
+
+       do {
+               /* clear pending */
+               xen_clear_irq_pending(irq);
+
+               /* check again make sure it didn't become free while
+        &n