Merge branch 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen
author Linus Torvalds <torvalds@woody.linux-foundation.org>
Wed, 17 Oct 2007 18:10:11 +0000 (11:10 -0700)
committer Linus Torvalds <torvalds@woody.linux-foundation.org>
Wed, 17 Oct 2007 18:10:11 +0000 (11:10 -0700)
* 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen:
  xfs: eagerly remove vmap mappings to avoid upsetting Xen
  xen: add some debug output for failed multicalls
  xen: fix incorrect vcpu_register_vcpu_info hypercall argument
  xen: ask the hypervisor how much space it needs reserved
  xen: lock pte pages while pinning/unpinning
  xen: deal with stale cr3 values when unpinning pagetables
  xen: add batch completion callbacks
  xen: yield to IPI target if necessary
  Clean up duplicate includes in arch/i386/xen/
  remove dead code in pgtable_cache_init
  paravirt: clean up lazy mode handling
  paravirt: refactor struct paravirt_ops into smaller pv_*_ops
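
The last item is the structural heart of this series: the monolithic struct paravirt_ops is split into pv_info, pv_init_ops, pv_time_ops, pv_cpu_ops, pv_irq_ops, pv_apic_ops and pv_mmu_ops, and the alternative.c, entry_32.S, enlighten.c and lguest.c hunks below are largely the mechanical fallout of that split. As a hedged, self-contained sketch of the idea (plain userspace C, not kernel code; every name in it is invented for the demo), splitting one big ops table into per-group tables lets a backend override only the group it owns:

    #include <stdio.h>

    /* One small ops structure per functional group, each independently
     * overridable -- the shape of the pv_*_ops split, nothing more. */
    struct demo_irq_ops { void (*irq_disable)(void); };
    struct demo_mmu_ops { void (*flush_tlb)(void); };

    static void native_irq_disable(void) { puts("native: cli"); }
    static void native_flush_tlb(void)   { puts("native: flush tlb"); }
    static void hv_flush_tlb(void)       { puts("backend: flush tlb via hypercall"); }

    static struct demo_irq_ops demo_irq = { .irq_disable = native_irq_disable };
    static struct demo_mmu_ops demo_mmu = { .flush_tlb   = native_flush_tlb };

    int main(void)
    {
            demo_mmu.flush_tlb = hv_flush_tlb;  /* override only the MMU group */

            demo_irq.irq_disable();             /* still the native hook */
            demo_mmu.flush_tlb();               /* now routed to the backend */
            return 0;
    }

In the real patches the equivalent installation step is the block in xen_start_kernel() further down, which assigns xen_info, xen_init_ops, xen_time_ops, xen_cpu_ops, xen_irq_ops, xen_apic_ops and xen_mmu_ops in place of the old single assignment of xen_paravirt_ops.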

arch/x86/kernel/alternative.c
arch/x86/kernel/entry_32.S
arch/x86/mm/init_32.c
arch/x86/xen/enlighten.c
arch/x86/xen/smp.c
drivers/lguest/lguest.c
mm/Kconfig

diff --combined arch/x86/kernel/alternative.c
index 11b03d3c6fdaf36a65f2d86aa769eddd44e914e6,63c55148dd05d8563ed4aed4abcc945f6137e52c..42421437ded310a4ea8bb75239639e274427c1c6
@@@ -10,7 -10,6 +10,7 @@@
  #include <asm/pgtable.h>
  #include <asm/mce.h>
  #include <asm/nmi.h>
 +#include <asm/vsyscall.h>
  
  #define MAX_PATCH_LEN (255-1)
  
@@@ -369,8 -368,8 +369,8 @@@ void apply_paravirt(struct paravirt_pat
                BUG_ON(p->len > MAX_PATCH_LEN);
                /* prep the buffer with the original instructions */
                memcpy(insnbuf, p->instr, p->len);
-               used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf,
-                                         (unsigned long)p->instr, p->len);
+               used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
+                                        (unsigned long)p->instr, p->len);
  
                BUG_ON(used > p->len);
  
diff --combined arch/x86/kernel/entry_32.S
index 8099fea0a72f1a959362f9a17763ee534d6316c2,1f2062e94d821c32002cd8e3fb43caa3b8908cb3..dc7f938e501501a74a3a97d943b5014795bcba49
@@@ -251,7 -251,6 +251,7 @@@ check_userspace
        jb resume_kernel                # not returning to v8086 or userspace
  
  ENTRY(resume_userspace)
 +      LOCKDEP_SYS_EXIT
        DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
                                        # setting need_resched or sigpending
                                        # between sampling and the iret
@@@ -339,7 -338,6 +339,7 @@@ sysenter_past_esp
        jae syscall_badsys
        call *sys_call_table(,%eax,4)
        movl %eax,PT_EAX(%esp)
 +      LOCKDEP_SYS_EXIT
        DISABLE_INTERRUPTS(CLBR_ANY)
        TRACE_IRQS_OFF
        movl TI_flags(%ebp), %ecx
@@@ -379,7 -377,6 +379,7 @@@ syscall_call
        call *sys_call_table(,%eax,4)
        movl %eax,PT_EAX(%esp)          # store the return value
  syscall_exit:
 +      LOCKDEP_SYS_EXIT
        DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
                                        # setting need_resched or sigpending
                                        # between sampling and the iret
@@@ -437,7 -434,7 +437,7 @@@ ldt_ss
         * is still available to implement the setting of the high
         * 16-bits in the INTERRUPT_RETURN paravirt-op.
         */
-       cmpl $0, paravirt_ops+PARAVIRT_enabled
+       cmpl $0, pv_info+PARAVIRT_enabled
        jne restore_nocheck
  #endif
  
@@@ -470,7 -467,6 +470,7 @@@ work_pending
        jz work_notifysig
  work_resched:
        call schedule
 +      LOCKDEP_SYS_EXIT
        DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
                                        # setting need_resched or sigpending
                                        # between sampling and the iret
diff --combined arch/x86/mm/init_32.c
index dda4e83649a0b778c6e16859f3660afc53b01f86,aa4708fda3e216b30bd8a662cb3b0460b973a359..33d367a3432ebfbafbe0294872a8a20a9547e456
@@@ -735,30 -735,23 +735,18 @@@ int arch_add_memory(int nid, u64 start
        return __add_pages(zone, start_pfn, nr_pages);
  }
  
 -int remove_memory(u64 start, u64 size)
 -{
 -      return -EINVAL;
 -}
 -EXPORT_SYMBOL_GPL(remove_memory);
  #endif
  
  struct kmem_cache *pmd_cache;
  
  void __init pgtable_cache_init(void)
  {
-       size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
-       if (PTRS_PER_PMD > 1) {
+       if (PTRS_PER_PMD > 1)
                pmd_cache = kmem_cache_create("pmd",
-                                       PTRS_PER_PMD*sizeof(pmd_t),
-                                       PTRS_PER_PMD*sizeof(pmd_t),
-                                       SLAB_PANIC,
-                                       pmd_ctor);
-               if (!SHARED_KERNEL_PMD) {
-                       /* If we're in PAE mode and have a non-shared
-                          kernel pmd, then the pgd size must be a
-                          page size.  This is because the pgd_list
-                          links through the page structure, so there
-                          can only be one pgd per page for this to
-                          work. */
-                       pgd_size = PAGE_SIZE;
-               }
-       }
+                                             PTRS_PER_PMD*sizeof(pmd_t),
+                                             PTRS_PER_PMD*sizeof(pmd_t),
+                                             SLAB_PANIC,
+                                             pmd_ctor);
  }
  
  /*
diff --combined arch/x86/xen/enlighten.c
index 493a083f6886bd4ae797bb6d93911f3236804fd1,c89e5b407f908f2f00d9b2f966b58a87f7b9e4b8..94c39aaf695facf3d3937633306bb7e5b17d41fd
@@@ -25,7 -25,6 +25,6 @@@
  #include <linux/mm.h>
  #include <linux/page-flags.h>
  #include <linux/highmem.h>
- #include <linux/smp.h>
  
  #include <xen/interface/xen.h>
  #include <xen/interface/physdev.h>
  
  EXPORT_SYMBOL_GPL(hypercall_page);
  
- DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
  DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
  DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
- DEFINE_PER_CPU(unsigned long, xen_cr3);
+ /*
+  * Note about cr3 (pagetable base) values:
+  *
+  * xen_cr3 contains the current logical cr3 value; it contains the
+  * last set cr3.  This may not be the current effective cr3, because
+  * its update may be being lazily deferred.  However, a vcpu looking
+  * at its own cr3 can use this value knowing that it everything will
+  * be self-consistent.
+  *
+  * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+  * hypercall to set the vcpu cr3 is complete (so it may be a little
+  * out of date, but it will never be set early).  If one vcpu is
+  * looking at another vcpu's cr3 value, it should use this variable.
+  */
+ DEFINE_PER_CPU(unsigned long, xen_cr3);        /* cr3 stored as physaddr */
+ DEFINE_PER_CPU(unsigned long, xen_current_cr3);        /* actual vcpu cr3 */
  
  struct start_info *xen_start_info;
  EXPORT_SYMBOL_GPL(xen_start_info);
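
A hedged model of the xen_cr3 / xen_current_cr3 pair described in the comment above (plain userspace C; only those two roles come from the diff, every other name is invented): the logical value is updated as soon as a new cr3 is requested, while the value other vcpus are allowed to trust is only updated once the batched base-pointer switch has actually been submitted, which is what the xen_mc_callback() in xen_write_cr3() below arranges.

    #include <stdio.h>

    static unsigned long logical_cr3;    /* plays the role of xen_cr3 */
    static unsigned long effective_cr3;  /* plays the role of xen_current_cr3 */

    static void request_new_cr3(unsigned long cr3)
    {
            logical_cr3 = cr3;   /* the local "vcpu" sees its own update at once */
            /* the base-pointer switch is only queued at this point */
    }

    static void batch_completed(unsigned long cr3)
    {
            effective_cr3 = cr3; /* stands in for the completion callback firing */
    }

    int main(void)
    {
            request_new_cr3(0x1000);
            printf("queued:  self=%#lx  cross-vcpu=%#lx\n", logical_cr3, effective_cr3);
            batch_completed(0x1000);
            printf("flushed: self=%#lx  cross-vcpu=%#lx\n", logical_cr3, effective_cr3);
            return 0;
    }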
@@@ -100,7 -113,7 +113,7 @@@ static void __init xen_vcpu_setup(int c
        info.mfn = virt_to_mfn(vcpup);
        info.offset = offset_in_page(vcpup);
  
-       printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
+       printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
               cpu, vcpup, info.mfn, info.offset);
  
        /* Check to see if the hypervisor will put the vcpu_info
  static void __init xen_banner(void)
  {
        printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
-              paravirt_ops.name);
+              pv_info.name);
        printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
  }
  
@@@ -249,29 -262,10 +262,10 @@@ static void xen_halt(void
                xen_safe_halt();
  }
  
- static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
+ static void xen_leave_lazy(void)
  {
-       BUG_ON(preemptible());
-       switch (mode) {
-       case PARAVIRT_LAZY_NONE:
-               BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
-               break;
-       case PARAVIRT_LAZY_MMU:
-       case PARAVIRT_LAZY_CPU:
-               BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
-               break;
-       case PARAVIRT_LAZY_FLUSH:
-               /* flush if necessary, but don't change state */
-               if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
-                       xen_mc_flush();
-               return;
-       }
+       paravirt_leave_lazy(paravirt_get_lazy_mode());
        xen_mc_flush();
-       x86_write_percpu(xen_lazy_mode, mode);
  }
  
  static unsigned long xen_store_tr(void)
@@@ -358,7 -352,7 +352,7 @@@ static void xen_load_tls(struct thread_
         * loaded properly.  This will go away as soon as Xen has been
         * modified to not save/restore %gs for normal hypercalls.
         */
-       if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
                loadsegment(gs, 0);
  }
  
@@@ -632,32 -626,36 +626,36 @@@ static unsigned long xen_read_cr3(void
        return x86_read_percpu(xen_cr3);
  }
  
+ static void set_current_cr3(void *v)
+ {
+       x86_write_percpu(xen_current_cr3, (unsigned long)v);
+ }
  static void xen_write_cr3(unsigned long cr3)
  {
+       struct mmuext_op *op;
+       struct multicall_space mcs;
+       unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
        BUG_ON(preemptible());
  
-       if (cr3 == x86_read_percpu(xen_cr3)) {
-               /* just a simple tlb flush */
-               xen_flush_tlb();
-               return;
-       }
+       mcs = xen_mc_entry(sizeof(*op));  /* disables interrupts */
  
+       /* Update while interrupts are disabled, so it's atomic with
+          respect to IPIs */
        x86_write_percpu(xen_cr3, cr3);
  
+       op = mcs.args;
+       op->cmd = MMUEXT_NEW_BASEPTR;
+       op->arg1.mfn = mfn;
  
-       {
-               struct mmuext_op *op;
-               struct multicall_space mcs = xen_mc_entry(sizeof(*op));
-               unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
-               op = mcs.args;
-               op->cmd = MMUEXT_NEW_BASEPTR;
-               op->arg1.mfn = mfn;
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
  
-               MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+       /* Update xen_current_cr3 once the batch has actually
+          been submitted. */
+       xen_mc_callback(set_current_cr3, (void *)cr3);
  
-               xen_mc_issue(PARAVIRT_LAZY_CPU);
-       }
+       xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
  }
  
  /* Early in boot, while setting up the initial pagetable, assume
@@@ -668,6 -666,15 +666,15 @@@ static __init void xen_alloc_pt_init(st
        make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
  }
  
+ static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
+ {
+       struct mmuext_op op;
+       op.cmd = level;
+       op.arg1.mfn = pfn_to_mfn(pfn);
+       if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+               BUG();
+ }
+ /* This needs to make sure the new pte page is pinned iff it's being
     attached to a pinned pagetable. */
  static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
  {
        struct page *page = pfn_to_page(pfn);
  
        if (PagePinned(virt_to_page(mm->pgd))) {
                SetPagePinned(page);
  
-               if (!PageHighMem(page))
+               if (!PageHighMem(page)) {
                        make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-               else
+                       pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+               } else
                        /* make sure there are no stray mappings of
                           this page */
                        kmap_flush_unused();
@@@ -692,8 -700,10 +700,10 @@@ static void xen_release_pt(u32 pfn
        struct page *page = pfn_to_page(pfn);
  
        if (PagePinned(page)) {
-               if (!PageHighMem(page))
+               if (!PageHighMem(page)) {
+                       pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
                        make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+               }
        }
  }
  
@@@ -738,7 -748,7 +748,7 @@@ static __init void xen_pagetable_setup_
        pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
  
        /* special set_pte for pagetable initialization */
-       paravirt_ops.set_pte = xen_set_pte_init;
+       pv_mmu_ops.set_pte = xen_set_pte_init;
  
        init_mm.pgd = base;
        /*
@@@ -785,8 -795,8 +795,8 @@@ static __init void xen_pagetable_setup_
  {
        /* This will work as long as patching hasn't happened yet
           (which it hasn't) */
-       paravirt_ops.alloc_pt = xen_alloc_pt;
-       paravirt_ops.set_pte = xen_set_pte;
+       pv_mmu_ops.alloc_pt = xen_alloc_pt;
+       pv_mmu_ops.set_pte = xen_set_pte;
  
        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
                /*
        /* Actually pin the pagetable down, but we can't set PG_pinned
           yet because the page structures don't exist yet. */
        {
-               struct mmuext_op op;
+               unsigned level;
  #ifdef CONFIG_X86_PAE
-               op.cmd = MMUEXT_PIN_L3_TABLE;
+               level = MMUEXT_PIN_L3_TABLE;
  #else
-               op.cmd = MMUEXT_PIN_L3_TABLE;
+               level = MMUEXT_PIN_L2_TABLE;
  #endif
-               op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
-               if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-                       BUG();
+               pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
        }
  }
  
@@@ -833,12 -843,12 +843,12 @@@ void __init xen_setup_vcpu_info_placeme
        if (have_vcpu_info_placement) {
                printk(KERN_INFO "Xen: using vcpu_info placement\n");
  
-               paravirt_ops.save_fl = xen_save_fl_direct;
-               paravirt_ops.restore_fl = xen_restore_fl_direct;
-               paravirt_ops.irq_disable = xen_irq_disable_direct;
-               paravirt_ops.irq_enable = xen_irq_enable_direct;
-               paravirt_ops.read_cr2 = xen_read_cr2_direct;
-               paravirt_ops.iret = xen_iret_direct;
+               pv_irq_ops.save_fl = xen_save_fl_direct;
+               pv_irq_ops.restore_fl = xen_restore_fl_direct;
+               pv_irq_ops.irq_disable = xen_irq_disable_direct;
+               pv_irq_ops.irq_enable = xen_irq_enable_direct;
+               pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
+               pv_cpu_ops.iret = xen_iret_direct;
        }
  }
  
@@@ -850,8 -860,8 +860,8 @@@ static unsigned xen_patch(u8 type, u16 
  
        start = end = reloc = NULL;
  
- #define SITE(x)                                                               \
-       case PARAVIRT_PATCH(x):                                         \
+ #define SITE(op, x)                                                   \
+       case PARAVIRT_PATCH(op.x):                                      \
        if (have_vcpu_info_placement) {                                 \
                start = (char *)xen_##x##_direct;                       \
                end = xen_##x##_direct_end;                             \
        goto patch_site
  
        switch (type) {
-               SITE(irq_enable);
-               SITE(irq_disable);
-               SITE(save_fl);
-               SITE(restore_fl);
+               SITE(pv_irq_ops, irq_enable);
+               SITE(pv_irq_ops, irq_disable);
+               SITE(pv_irq_ops, save_fl);
+               SITE(pv_irq_ops, restore_fl);
  #undef SITE
  
        patch_site:
        return ret;
  }
  
- static const struct paravirt_ops xen_paravirt_ops __initdata = {
+ static const struct pv_info xen_info __initdata = {
        .paravirt_enabled = 1,
        .shared_kernel_pmd = 0,
  
        .name = "Xen",
-       .banner = xen_banner,
+ };
  
+ static const struct pv_init_ops xen_init_ops __initdata = {
        .patch = xen_patch,
  
+       .banner = xen_banner,
        .memory_setup = xen_memory_setup,
        .arch_setup = xen_arch_setup,
-       .init_IRQ = xen_init_IRQ,
        .post_allocator_init = xen_mark_init_mm_pinned,
+ };
  
+ static const struct pv_time_ops xen_time_ops __initdata = {
        .time_init = xen_time_init,
        .set_wallclock = xen_set_wallclock,
        .get_wallclock = xen_get_wallclock,
        .get_cpu_khz = xen_cpu_khz,
        .sched_clock = xen_sched_clock,
+ };
  
+ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        .cpuid = xen_cpuid,
  
        .set_debugreg = xen_set_debugreg,
        .read_cr0 = native_read_cr0,
        .write_cr0 = native_write_cr0,
  
-       .read_cr2 = xen_read_cr2,
-       .write_cr2 = xen_write_cr2,
-       .read_cr3 = xen_read_cr3,
-       .write_cr3 = xen_write_cr3,
        .read_cr4 = native_read_cr4,
        .read_cr4_safe = native_read_cr4_safe,
        .write_cr4 = xen_write_cr4,
  
-       .save_fl = xen_save_fl,
-       .restore_fl = xen_restore_fl,
-       .irq_disable = xen_irq_disable,
-       .irq_enable = xen_irq_enable,
-       .safe_halt = xen_safe_halt,
-       .halt = xen_halt,
        .wbinvd = native_wbinvd,
  
        .read_msr = native_read_msr_safe,
        .set_iopl_mask = xen_set_iopl_mask,
        .io_delay = xen_io_delay,
  
+       .lazy_mode = {
+               .enter = paravirt_enter_lazy_cpu,
+               .leave = xen_leave_lazy,
+       },
+ };
+ static const struct pv_irq_ops xen_irq_ops __initdata = {
+       .init_IRQ = xen_init_IRQ,
+       .save_fl = xen_save_fl,
+       .restore_fl = xen_restore_fl,
+       .irq_disable = xen_irq_disable,
+       .irq_enable = xen_irq_enable,
+       .safe_halt = xen_safe_halt,
+       .halt = xen_halt,
+ };
+ static const struct pv_apic_ops xen_apic_ops __initdata = {
  #ifdef CONFIG_X86_LOCAL_APIC
        .apic_write = xen_apic_write,
        .apic_write_atomic = xen_apic_write,
        .setup_secondary_clock = paravirt_nop,
        .startup_ipi_hook = paravirt_nop,
  #endif
+ };
+ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
+       .pagetable_setup_start = xen_pagetable_setup_start,
+       .pagetable_setup_done = xen_pagetable_setup_done,
+       .read_cr2 = xen_read_cr2,
+       .write_cr2 = xen_write_cr2,
+       .read_cr3 = xen_read_cr3,
+       .write_cr3 = xen_write_cr3,
  
        .flush_tlb_user = xen_flush_tlb,
        .flush_tlb_kernel = xen_flush_tlb,
        .pte_update = paravirt_nop,
        .pte_update_defer = paravirt_nop,
  
-       .pagetable_setup_start = xen_pagetable_setup_start,
-       .pagetable_setup_done = xen_pagetable_setup_done,
        .alloc_pt = xen_alloc_pt_init,
        .release_pt = xen_release_pt,
        .alloc_pd = paravirt_nop,
        .dup_mmap = xen_dup_mmap,
        .exit_mmap = xen_exit_mmap,
  
-       .set_lazy_mode = xen_set_lazy_mode,
+       .lazy_mode = {
+               .enter = paravirt_enter_lazy_mmu,
+               .leave = xen_leave_lazy,
+       },
  };
  
  #ifdef CONFIG_SMP
@@@ -1080,6 -1112,17 +1112,17 @@@ static const struct machine_ops __initd
  };
  
  
+ static void __init xen_reserve_top(void)
+ {
+       unsigned long top = HYPERVISOR_VIRT_START;
+       struct xen_platform_parameters pp;
+       if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+               top = pp.virt_start;
+       reserve_top_address(-top + 2 * PAGE_SIZE);
+ }
  /* First C function to be called on Xen boot */
  asmlinkage void __init xen_start_kernel(void)
  {
        BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
  
        /* Install Xen paravirt ops */
-       paravirt_ops = xen_paravirt_ops;
+       pv_info = xen_info;
+       pv_init_ops = xen_init_ops;
+       pv_time_ops = xen_time_ops;
+       pv_cpu_ops = xen_cpu_ops;
+       pv_irq_ops = xen_irq_ops;
+       pv_apic_ops = xen_apic_ops;
+       pv_mmu_ops = xen_mmu_ops;
        machine_ops = xen_machine_ops;
  
  #ifdef CONFIG_SMP
        /* keep using Xen gdt for now; no urgent need to change it */
  
        x86_write_percpu(xen_cr3, __pa(pgd));
+       x86_write_percpu(xen_current_cr3, __pa(pgd));
  
  #ifdef CONFIG_SMP
        /* Don't do the full vcpu_info placement stuff until we have a
        xen_setup_vcpu_info_placement();
  #endif
  
-       paravirt_ops.kernel_rpl = 1;
+       pv_info.kernel_rpl = 1;
        if (xen_feature(XENFEAT_supervisor_mode_kernel))
-               paravirt_ops.kernel_rpl = 0;
+               pv_info.kernel_rpl = 0;
  
        /* set the limit of our address space */
-       reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
+       xen_reserve_top();
  
        /* set up basic CPUID stuff */
        cpu_detect(&new_cpu_data);
        new_cpu_data.x86_capability[0] = cpuid_edx(1);
  
        /* Poke various useful things into boot_params */
 -      LOADER_TYPE = (9 << 4) | 0;
 -      INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
 -      INITRD_SIZE = xen_start_info->mod_len;
 +      boot_params.hdr.type_of_loader = (9 << 4) | 0;
 +      boot_params.hdr.ramdisk_image = xen_start_info->mod_start
 +              ? __pa(xen_start_info->mod_start) : 0;
 +      boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
  
        /* Start the world */
        start_kernel();
diff --combined arch/x86/xen/smp.c
index 4fa33c27ccb6d4d4e6a7619f7bd8c46c25f36a2c,865953e6f341ef7da3699106684f5803553209cf..d53bf9d8a72d083ed0274860c042be3ec6ff3c2e
@@@ -147,13 -147,8 +147,13 @@@ void __init xen_smp_prepare_boot_cpu(vo
        make_lowmem_page_readwrite(&per_cpu__gdt_page);
  
        for (cpu = 0; cpu < NR_CPUS; cpu++) {
 -              cpus_clear(cpu_sibling_map[cpu]);
 -              cpus_clear(cpu_core_map[cpu]);
 +              cpus_clear(per_cpu(cpu_sibling_map, cpu));
 +              /*
 +               * cpu_core_map lives in a per cpu area that is cleared
 +               * when the per cpu array is allocated.
 +               *
 +               * cpus_clear(per_cpu(cpu_core_map, cpu));
 +               */
        }
  
        xen_setup_vcpu_info_placement();
@@@ -164,13 -159,8 +164,13 @@@ void __init xen_smp_prepare_cpus(unsign
        unsigned cpu;
  
        for (cpu = 0; cpu < NR_CPUS; cpu++) {
 -              cpus_clear(cpu_sibling_map[cpu]);
 -              cpus_clear(cpu_core_map[cpu]);
 +              cpus_clear(per_cpu(cpu_sibling_map, cpu));
 +              /*
 +               * cpu_core_map will be zeroed when the per
 +               * cpu area is allocated.
 +               *
 +               * cpus_clear(per_cpu(cpu_core_map, cpu));
 +               */
        }
  
        smp_store_cpu_info(0);
@@@ -370,7 -360,8 +370,8 @@@ int xen_smp_call_function_mask(cpumask_
                               void *info, int wait)
  {
        struct call_data_struct data;
-       int cpus;
+       int cpus, cpu;
+       bool yield;
  
        /* Holding any lock stops cpus from going down. */
        spin_lock(&call_lock);
        /* Send a message to other CPUs and wait for them to respond */
        xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
  
-       /* Make sure other vcpus get a chance to run.
-          XXX too severe?  Maybe we should check the other CPU's states? */
-       HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+       /* Make sure other vcpus get a chance to run if they need to. */
+       yield = false;
+       for_each_cpu_mask(cpu, mask)
+               if (xen_vcpu_stolen(cpu))
+                       yield = true;
+       if (yield)
+               HYPERVISOR_sched_op(SCHEDOP_yield, 0);
  
        /* Wait for response */
        while (atomic_read(&data.started) != cpus ||
diff --combined drivers/lguest/lguest.c
index 4a579c840301fe77539af414e4c4086d97bba5ca,c302629e0895d13501953b2590b07c65f7dbb0af..3ba337dde8575e1570ed9f0ee77d0c9170247efc
@@@ -23,7 -23,7 +23,7 @@@
   *
   * So how does the kernel know it's a Guest?  The Guest starts at a special
   * entry point marked with a magic string, which sets up a few things then
-  * calls here.  We replace the native functions in "struct paravirt_ops"
+  * calls here.  We replace the native functions in various "paravirt" structures
   * with our Guest versions, then boot like normal. :*/
  
  /*
@@@ -97,29 -97,17 +97,17 @@@ static cycle_t clock_base
   * them as a batch when lazy_mode is eventually turned off.  Because hypercalls
   * are reasonably expensive, batching them up makes sense.  For example, a
   * large mmap might update dozens of page table entries: that code calls
-  * lguest_lazy_mode(PARAVIRT_LAZY_MMU), does the dozen updates, then calls
-  * lguest_lazy_mode(PARAVIRT_LAZY_NONE).
+  * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
+  * lguest_leave_lazy_mode().
   *
   * So, when we're in lazy mode, we call async_hcall() to store the call for
   * future processing.  When lazy mode is turned off we issue a hypercall to
   * flush the stored calls.
-  *
-  * There's also a hack where "mode" is set to "PARAVIRT_LAZY_FLUSH" which
-  * indicates we're to flush any outstanding calls immediately.  This is used
-  * when an interrupt handler does a kmap_atomic(): the page table changes must
-  * happen immediately even if we're in the middle of a batch.  Usually we're
-  * not, though, so there's nothing to do. */
- static enum paravirt_lazy_mode lazy_mode; /* Note: not SMP-safe! */
- static void lguest_lazy_mode(enum paravirt_lazy_mode mode)
+  */
+ static void lguest_leave_lazy_mode(void)
  {
-       if (mode == PARAVIRT_LAZY_FLUSH) {
-               if (unlikely(lazy_mode != PARAVIRT_LAZY_NONE))
-                       hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
-       } else {
-               lazy_mode = mode;
-               if (mode == PARAVIRT_LAZY_NONE)
-                       hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
-       }
+       paravirt_leave_lazy(paravirt_get_lazy_mode());
+       hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
  }
  
  static void lazy_hcall(unsigned long call,
                       unsigned long arg1,
                       unsigned long arg2,
                       unsigned long arg3)
  {
-       if (lazy_mode == PARAVIRT_LAZY_NONE)
+       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
                hcall(call, arg1, arg2, arg3);
        else
                async_hcall(call, arg1, arg2, arg3);
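
The lguest comment above describes the same batching idea in miniature: while a lazy mode is active, hypercalls are stored instead of issued, and leaving lazy mode flushes everything that was deferred. A hedged, self-contained model of that queue-then-flush behaviour (plain userspace C; none of these names are lguest's, and the real guest hands the queued calls to the host rather than replaying them itself):

    #include <stdio.h>

    enum lazy_mode { LAZY_NONE, LAZY_MMU };

    static enum lazy_mode mode;
    static unsigned long queue[8];
    static int queued;

    static void issue(unsigned long call) { printf("hypercall %lu\n", call); }

    static void lazy_call(unsigned long call)
    {
            if (mode == LAZY_NONE)
                    issue(call);             /* immediate, like hcall()      */
            else if (queued < 8)
                    queue[queued++] = call;  /* deferred, like async_hcall() */
    }

    static void leave_lazy(void)
    {
            int i;

            for (i = 0; i < queued; i++)     /* one flush for the whole batch */
                    issue(queue[i]);
            queued = 0;
            mode = LAZY_NONE;
    }

    int main(void)
    {
            mode = LAZY_MMU;
            lazy_call(1); lazy_call(2); lazy_call(3); /* e.g. a large mmap   */
            leave_lazy();                             /* flushed in one go   */
            return 0;
    }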
@@@ -331,7 -319,7 +319,7 @@@ static void lguest_load_tls(struct thre
  }
  
  /*G:038 That's enough excitement for now, back to ploughing through each of
-  * the paravirt_ops (we're about 1/3 of the way through).
+  * the different pv_ops structures (we're about 1/3 of the way through).
   *
   * This is the Local Descriptor Table, another weird Intel thingy.  Linux only
   * uses this for some strange applications like Wine.  We don't do anything
@@@ -558,7 -546,7 +546,7 @@@ static void lguest_set_pte(pte_t *ptep
                lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
  }
  
- /* Unfortunately for Lguest, the paravirt_ops for page tables were based on
+ /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
   * native page table operations.  On native hardware you can set a new page
   * table entry whenever you want, but if you want to remove one you have to do
   * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
@@@ -782,7 -770,7 +770,7 @@@ static void lguest_time_init(void
        clocksource_register(&lguest_clock);
  
        /* Now we've set up our clock, we can use it as the scheduler clock */
-       paravirt_ops.sched_clock = lguest_sched_clock;
+       pv_time_ops.sched_clock = lguest_sched_clock;
  
        /* We can't set cpumask in the initializer: damn C limitations!  Set it
         * here and register our timer device. */
@@@ -893,9 -881,7 +881,9 @@@ static __init char *lguest_memory_setup
  
        /* The Linux bootloader header contains an "e820" memory map: the
         * Launcher populated the first entry with our memory limit. */
 -      add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type);
 +      add_memory_region(boot_params.e820_map[0].addr,
 +                        boot_params.e820_map[0].size,
 +                        boot_params.e820_map[0].type);
  
        /* This string is for the boot messages. */
        return "LGUEST";
  /*G:050
   * Patching (Powerfully Placating Performance Pedants)
   *
-  * We have already seen that "struct paravirt_ops" lets us replace simple
+  * We have already seen that pv_ops structures let us replace simple
   * native instructions with calls to the appropriate back end all throughout
   * the kernel.  This allows the same kernel to run as a Guest and as a native
   * kernel, but it's slow because of all the indirect branches.
@@@ -929,10 -915,10 +917,10 @@@ static const struct lguest_insn
  {
        const char *start, *end;
  } lguest_insns[] = {
-       [PARAVIRT_PATCH(irq_disable)] = { lgstart_cli, lgend_cli },
-       [PARAVIRT_PATCH(irq_enable)] = { lgstart_sti, lgend_sti },
-       [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf },
-       [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf },
+       [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
+       [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
+       [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
+       [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
  };
  
  /* Now our patch routine is fairly simple (based on the native one in
@@@ -959,9 -945,9 +947,9 @@@ static unsigned lguest_patch(u8 type, u
        return insn_len;
  }
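
As the G:050 comment and the lguest_insns[] table above show, each patchable call site records which op it calls, and the patch routine either copies a short native instruction sequence over the call or leaves the slower indirect call in place when no replacement fits. A hedged, self-contained model of that decision (plain userspace C; the real index is the offsetof-based PARAVIRT_PATCH() value and the real buffers hold machine code, not strings):

    #include <stdio.h>
    #include <string.h>

    enum { PATCH_IRQ_DISABLE, PATCH_IRQ_ENABLE, NR_PATCH };

    /* Per-op replacement bodies, like the lgstart_xxx/lgend_xxx snippets. */
    static const char *inline_body[NR_PATCH] = {
            [PATCH_IRQ_DISABLE] = "cli-like snippet",
            [PATCH_IRQ_ENABLE]  = "sti-like snippet",
    };

    /* Copy the body over the call site if it fits; otherwise return 0 and
     * keep the original indirect call. */
    static unsigned patch_site(int type, char *buf, unsigned len)
    {
            const char *body = inline_body[type];
            unsigned body_len = body ? (unsigned)strlen(body) : 0;

            if (!body_len || body_len > len)
                    return 0;
            memcpy(buf, body, body_len);
            return body_len;
    }

    int main(void)
    {
            char site[32] = "call *irq_disable_op";
            unsigned used = patch_site(PATCH_IRQ_DISABLE, site, sizeof(site));

            printf("patched %u bytes: %.*s\n", used, (int)used, site);
            return 0;
    }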
  
- /*G:030 Once we get to lguest_init(), we know we're a Guest.  The paravirt_ops
-  * structure in the kernel provides a single point for (almost) every routine
-  * we have to override to avoid privileged instructions. */
+ /*G:030 Once we get to lguest_init(), we know we're a Guest.  The pv_ops
+  * structures in the kernel provide points for (almost) every routine we have
+  * to override to avoid privileged instructions. */
  __init void lguest_init(void *boot)
  {
        /* Copy boot parameters first: the Launcher put the physical location
  
        /* We're under lguest, paravirt is enabled, and we're running at
         * privilege level 1, not 0 as normal. */
-       paravirt_ops.name = "lguest";
-       paravirt_ops.paravirt_enabled = 1;
-       paravirt_ops.kernel_rpl = 1;
+       pv_info.name = "lguest";
+       pv_info.paravirt_enabled = 1;
+       pv_info.kernel_rpl = 1;
  
        /* We set up all the lguest overrides for sensitive operations.  These
         * are detailed with the operations themselves. */
-       paravirt_ops.save_fl = save_fl;
-       paravirt_ops.restore_fl = restore_fl;
-       paravirt_ops.irq_disable = irq_disable;
-       paravirt_ops.irq_enable = irq_enable;
-       paravirt_ops.load_gdt = lguest_load_gdt;
-       paravirt_ops.memory_setup = lguest_memory_setup;
-       paravirt_ops.cpuid = lguest_cpuid;
-       paravirt_ops.write_cr3 = lguest_write_cr3;
-       paravirt_ops.flush_tlb_user = lguest_flush_tlb_user;
-       paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;
-       paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
-       paravirt_ops.set_pte = lguest_set_pte;
-       paravirt_ops.set_pte_at = lguest_set_pte_at;
-       paravirt_ops.set_pmd = lguest_set_pmd;
+       /* interrupt-related operations */
+       pv_irq_ops.init_IRQ = lguest_init_IRQ;
+       pv_irq_ops.save_fl = save_fl;
+       pv_irq_ops.restore_fl = restore_fl;
+       pv_irq_ops.irq_disable = irq_disable;
+       pv_irq_ops.irq_enable = irq_enable;
+       pv_irq_ops.safe_halt = lguest_safe_halt;
+       /* init-time operations */
+       pv_init_ops.memory_setup = lguest_memory_setup;
+       pv_init_ops.patch = lguest_patch;
+       /* Intercepts of various cpu instructions */
+       pv_cpu_ops.load_gdt = lguest_load_gdt;
+       pv_cpu_ops.cpuid = lguest_cpuid;
+       pv_cpu_ops.load_idt = lguest_load_idt;
+       pv_cpu_ops.iret = lguest_iret;
+       pv_cpu_ops.load_esp0 = lguest_load_esp0;
+       pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
+       pv_cpu_ops.set_ldt = lguest_set_ldt;
+       pv_cpu_ops.load_tls = lguest_load_tls;
+       pv_cpu_ops.set_debugreg = lguest_set_debugreg;
+       pv_cpu_ops.clts = lguest_clts;
+       pv_cpu_ops.read_cr0 = lguest_read_cr0;
+       pv_cpu_ops.write_cr0 = lguest_write_cr0;
+       pv_cpu_ops.read_cr4 = lguest_read_cr4;
+       pv_cpu_ops.write_cr4 = lguest_write_cr4;
+       pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
+       pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
+       pv_cpu_ops.wbinvd = lguest_wbinvd;
+       pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
+       pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+       /* pagetable management */
+       pv_mmu_ops.write_cr3 = lguest_write_cr3;
+       pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
+       pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
+       pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
+       pv_mmu_ops.set_pte = lguest_set_pte;
+       pv_mmu_ops.set_pte_at = lguest_set_pte_at;
+       pv_mmu_ops.set_pmd = lguest_set_pmd;
+       pv_mmu_ops.read_cr2 = lguest_read_cr2;
+       pv_mmu_ops.read_cr3 = lguest_read_cr3;
+       pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
+       pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
  #ifdef CONFIG_X86_LOCAL_APIC
-       paravirt_ops.apic_write = lguest_apic_write;
-       paravirt_ops.apic_write_atomic = lguest_apic_write;
-       paravirt_ops.apic_read = lguest_apic_read;
+       /* apic read/write intercepts */
+       pv_apic_ops.apic_write = lguest_apic_write;
+       pv_apic_ops.apic_write_atomic = lguest_apic_write;
+       pv_apic_ops.apic_read = lguest_apic_read;
  #endif
-       paravirt_ops.load_idt = lguest_load_idt;
-       paravirt_ops.iret = lguest_iret;
-       paravirt_ops.load_esp0 = lguest_load_esp0;
-       paravirt_ops.load_tr_desc = lguest_load_tr_desc;
-       paravirt_ops.set_ldt = lguest_set_ldt;
-       paravirt_ops.load_tls = lguest_load_tls;
-       paravirt_ops.set_debugreg = lguest_set_debugreg;
-       paravirt_ops.clts = lguest_clts;
-       paravirt_ops.read_cr0 = lguest_read_cr0;
-       paravirt_ops.write_cr0 = lguest_write_cr0;
-       paravirt_ops.init_IRQ = lguest_init_IRQ;
-       paravirt_ops.read_cr2 = lguest_read_cr2;
-       paravirt_ops.read_cr3 = lguest_read_cr3;
-       paravirt_ops.read_cr4 = lguest_read_cr4;
-       paravirt_ops.write_cr4 = lguest_write_cr4;
-       paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
-       paravirt_ops.write_idt_entry = lguest_write_idt_entry;
-       paravirt_ops.patch = lguest_patch;
-       paravirt_ops.safe_halt = lguest_safe_halt;
-       paravirt_ops.get_wallclock = lguest_get_wallclock;
-       paravirt_ops.time_init = lguest_time_init;
-       paravirt_ops.set_lazy_mode = lguest_lazy_mode;
-       paravirt_ops.wbinvd = lguest_wbinvd;
+       /* time operations */
+       pv_time_ops.get_wallclock = lguest_get_wallclock;
+       pv_time_ops.time_init = lguest_time_init;
        /* Now is a good time to look at the implementations of these functions
         * before returning to the rest of lguest_init(). */
  
diff --combined mm/Kconfig
index 1cc6cada2bbfa837ed0f37ec292f4657125cec68,e24d348083c34da69472bc22a9b7025d562a3ffe..b1f03b0eb7f13e5318535ef1ef799783803e3acd
@@@ -112,19 -112,6 +112,19 @@@ config SPARSEMEM_EXTREM
        def_bool y
        depends on SPARSEMEM && !SPARSEMEM_STATIC
  
 +#
 +# SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page
 +# and page_to_pfn.  The most efficient option where kernel virtual space is
 +# not under pressure.
 +#
 +config SPARSEMEM_VMEMMAP_ENABLE
 +      def_bool n
 +
 +config SPARSEMEM_VMEMMAP
 +      bool
 +      depends on SPARSEMEM
 +      default y if (SPARSEMEM_VMEMMAP_ENABLE)
 +
  # eventually, we can have this option just 'select SPARSEMEM'
  config MEMORY_HOTPLUG
        bool "Allow for memory hot-add"
@@@ -139,11 -126,6 +139,11 @@@ config MEMORY_HOTPLUG_SPARS
        def_bool y
        depends on SPARSEMEM && MEMORY_HOTPLUG
  
 +config MEMORY_HOTREMOVE
 +      bool "Allow for memory hot remove"
 +      depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
 +      depends on MIGRATION
 +
  # Heavily threaded applications may benefit from splitting the mm-wide
  # page_table_lock, so that faults on different parts of the user address
  # space can be handled with less contention: split it at this NR_CPUS.
@@@ -155,7 -137,6 +155,6 @@@ config SPLIT_PTLOCK_CPU
        int
        default "4096" if ARM && !CPU_CACHE_VIPT
        default "4096" if PARISC && !PA20
-       default "4096" if XEN
        default "4"
  
  #