Merge branch 'for-2.6.38' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 37462f1ddba54d1ad3af14a682efb1e71dc6df96..c7149c96d0795f592d2c24252ef471bb310aa471 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,7 +62,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mtrr.h>
-#include <asm/vmi.h>
+#include <asm/mwait.h>
 #include <asm/apic.h>
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
@@ -73,7 +73,6 @@
 
 #ifdef CONFIG_X86_32
 u8 apicid_2_node[MAX_APICID];
-static int low_mappings;
 #endif
 
 /* State of each CPU */
@@ -91,6 +90,25 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
 static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
 #define get_idle_for_cpu(x)      (per_cpu(idle_thread_array, x))
 #define set_idle_for_cpu(x, p)   (per_cpu(idle_thread_array, x) = (p))
+
+/*
+ * We need this to protect trampoline_base from concurrent accesses when
+ * cores are being off- and onlined rapidly.
+ */
+static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
+
+void cpu_hotplug_driver_lock(void)
+{
+        mutex_lock(&x86_cpu_hotplug_driver_mutex);
+}
+
+void cpu_hotplug_driver_unlock(void)
+{
+        mutex_unlock(&x86_cpu_hotplug_driver_mutex);
+}
+
+ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
+ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
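
The lock pair above is meant to be taken by hotplug callers before they touch the real-mode trampoline. A minimal usage sketch follows; the helper below is hypothetical and not part of this patch, only cpu_hotplug_driver_lock()/unlock() and cpu_up() are existing kernel symbols:

/* Illustration only -- not part of this patch. */
static int example_bring_cpu_online(unsigned int cpu)
{
	int ret;

	cpu_hotplug_driver_lock();	/* serializes use of trampoline_base */
	ret = cpu_up(cpu);		/* AP bring-up goes through the trampoline */
	cpu_hotplug_driver_unlock();

	return ret;
}
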
 #else
 static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
 #define get_idle_for_cpu(x)      (idle_thread_array[(x)])
@@ -263,6 +281,13 @@ static void __cpuinit smp_callin(void)
         */
        smp_store_cpu_info(cpuid);
 
+       /*
+        * This must be done before setting cpu_online_mask
+        * or calling notify_cpu_starting.
+        */
+       set_cpu_sibling_map(raw_smp_processor_id());
+       wmb();
+
        notify_cpu_starting(cpuid);
 
        /*
@@ -281,11 +306,16 @@ notrace static void __cpuinit start_secondary(void *unused)
         * fragile that we want to limit the things done here to the
         * most necessary things.
         */
-       vmi_bringup();
        cpu_init();
        preempt_disable();
        smp_callin();
 
+#ifdef CONFIG_X86_32
+       /* switch away from the initial page table */
+       load_cr3(swapper_pg_dir);
+       __flush_tlb_all();
+#endif
+
        /* otherwise gcc will move up smp_processor_id before the cpu_init */
        barrier();
        /*
@@ -293,22 +323,6 @@ notrace static void __cpuinit start_secondary(void *unused)
         */
        check_tsc_sync_target();
 
-       if (nmi_watchdog == NMI_IO_APIC) {
-               legacy_pic->chip->mask(0);
-               enable_NMI_through_LVT0();
-               legacy_pic->chip->unmask(0);
-       }
-
-#ifdef CONFIG_X86_32
-       while (low_mappings)
-               cpu_relax();
-       __flush_tlb_all();
-#endif
-
-       /* This must be done before setting cpu_online_mask */
-       set_cpu_sibling_map(raw_smp_processor_id());
-       wmb();
-
        /*
         * We need to hold call_lock, so there is no inconsistency
         * between the time smp_call_function() determines number of
@@ -373,6 +387,19 @@ void __cpuinit smp_store_cpu_info(int id)
                identify_secondary_cpu(c);
 }
 
+static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+{
+       struct cpuinfo_x86 *c1 = &cpu_data(cpu1);
+       struct cpuinfo_x86 *c2 = &cpu_data(cpu2);
+
+       cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
+       cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
+       cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
+       cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
+       cpumask_set_cpu(cpu1, c2->llc_shared_map);
+       cpumask_set_cpu(cpu2, c1->llc_shared_map);
+}
+
 
 void __cpuinit set_cpu_sibling_map(int cpu)
 {
@@ -385,14 +412,13 @@ void __cpuinit set_cpu_sibling_map(int cpu)
                for_each_cpu(i, cpu_sibling_setup_mask) {
                        struct cpuinfo_x86 *o = &cpu_data(i);
 
-                       if (c->phys_proc_id == o->phys_proc_id &&
-                           c->cpu_core_id == o->cpu_core_id) {
-                               cpumask_set_cpu(i, cpu_sibling_mask(cpu));
-                               cpumask_set_cpu(cpu, cpu_sibling_mask(i));
-                               cpumask_set_cpu(i, cpu_core_mask(cpu));
-                               cpumask_set_cpu(cpu, cpu_core_mask(i));
-                               cpumask_set_cpu(i, c->llc_shared_map);
-                               cpumask_set_cpu(cpu, o->llc_shared_map);
+                       if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+                               if (c->phys_proc_id == o->phys_proc_id &&
+                                   c->compute_unit_id == o->compute_unit_id)
+                                       link_thread_siblings(cpu, i);
+                       } else if (c->phys_proc_id == o->phys_proc_id &&
+                                  c->cpu_core_id == o->cpu_core_id) {
+                               link_thread_siblings(cpu, i);
                        }
                }
        } else {
@@ -401,7 +427,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 
        cpumask_set_cpu(cpu, c->llc_shared_map);
 
-       if (current_cpu_data.x86_max_cores == 1) {
+       if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
                cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
                c->booted_cores = 1;
                return;
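
Read together, the TOPOEXT-aware sibling test introduced above amounts to the following predicate. The helper is a sketch for illustration only (it does not exist in the patch); the field names mirror those used in set_cpu_sibling_map():

/* Illustration only -- not part of this patch. */
static bool cpus_are_thread_siblings(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
	if (c->phys_proc_id != o->phys_proc_id)
		return false;

	/* With topology extensions (e.g. AMD compute units), pair by compute unit id. */
	if (cpu_has(c, X86_FEATURE_TOPOEXT))
		return c->compute_unit_id == o->compute_unit_id;

	/* Classic test: same package and same core id. */
	return c->cpu_core_id == o->cpu_core_id;
}
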
@@ -686,7 +712,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
 static void __cpuinit announce_cpu(int cpu, int apicid)
 {
        static int current_node = -1;
-       int node = cpu_to_node(cpu);
+       int node = early_cpu_to_node(cpu);
 
        if (system_state == SYSTEM_BOOTING) {
                if (node != current_node) {
@@ -718,7 +744,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
                .done   = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
        };
 
-       INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
+       INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
 
        alternatives_smp_switch(1);
 
@@ -735,12 +761,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
                goto do_rest;
        }
 
-       if (!keventd_up() || current_is_keventd())
-               c_idle.work.func(&c_idle.work);
-       else {
-               schedule_work(&c_idle.work);
-               wait_for_completion(&c_idle.done);
-       }
+       schedule_work(&c_idle.work);
+       wait_for_completion(&c_idle.done);
 
        if (IS_ERR(c_idle.idle)) {
                printk("failed fork for CPU %d\n", cpu);
@@ -816,6 +838,13 @@ do_rest:
                        if (cpumask_test_cpu(cpu, cpu_callin_mask))
                                break;  /* It has booted */
                        udelay(100);
+                       /*
+                        * Allow other tasks to run while we wait for the
+                        * AP to come online. This also gives a chance
+                        * for the MTRR work (triggered by the AP coming online)
+                        * to be completed in the stop machine context.
+                        */
+                       schedule();
                }
 
                if (cpumask_test_cpu(cpu, cpu_callin_mask))
@@ -894,20 +923,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
 
        per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 
-#ifdef CONFIG_X86_32
-       /* init low mem mapping */
-       clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
-               min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
-       flush_tlb_all();
-       low_mappings = 1;
-
-       err = do_boot_cpu(apicid, cpu);
-
-       zap_low_mappings(false);
-       low_mappings = 0;
-#else
        err = do_boot_cpu(apicid, cpu);
-#endif
        if (err) {
                pr_debug("do_boot_cpu failed %d\n", err);
                return -EIO;
@@ -1042,8 +1058,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
                printk(KERN_INFO "SMP mode deactivated.\n");
                smpboot_clear_io_apic();
 
-               localise_nmi_watchdog();
-
                connect_bsp_APIC();
                setup_local_APIC();
                end_local_APIC_setup();
@@ -1075,7 +1089,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 
        preempt_disable();
        smp_cpu_index_default();
-       current_cpu_data = boot_cpu_data;
+       memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info));
        cpumask_copy(cpu_callin_mask, cpumask_of(0));
        mb();
        /*
@@ -1093,8 +1107,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
        }
        set_cpu_sibling_map(0);
 
-       enable_IR_x2apic();
-       default_setup_apic_routing();
 
        if (smp_sanity_check(max_cpus) < 0) {
                printk(KERN_INFO "SMP disabled\n");
@@ -1102,6 +1114,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
                goto out;
        }
 
+       default_setup_apic_routing();
+
        preempt_disable();
        if (read_apic_id() != boot_cpu_physical_apicid) {
                panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
@@ -1147,6 +1161,20 @@ out:
        preempt_enable();
 }
 
+void arch_disable_nonboot_cpus_begin(void)
+{
+       /*
+        * Avoid the smp alternatives switch during the disable_nonboot_cpus().
+        * In the suspend path, we will be back in SMP mode shortly anyway.
+        */
+       skip_smp_alternatives = true;
+}
+
+void arch_disable_nonboot_cpus_end(void)
+{
+       skip_smp_alternatives = false;
+}
+
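
These hooks pair with arch_enable_nonboot_cpus_begin()/end() below and are invoked from the generic CPU-hotplug code on the suspend path. A simplified sketch of such a caller (not the actual kernel/cpu.c implementation; example_disable_nonboot_cpus() is hypothetical) shows where the alternatives switch is skipped:

/* Simplified illustration -- not the real disable_nonboot_cpus(). */
static int example_disable_nonboot_cpus(void)
{
	int cpu, err = 0;

	arch_disable_nonboot_cpus_begin();	/* sets skip_smp_alternatives */
	for_each_online_cpu(cpu) {
		if (cpu == 0)
			continue;
		err = cpu_down(cpu);		/* no UP<->SMP alternatives rewrite here */
		if (err)
			break;
	}
	arch_disable_nonboot_cpus_end();	/* clears skip_smp_alternatives */

	return err;
}
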
 void arch_enable_nonboot_cpus_begin(void)
 {
        set_mtrr_aps_delayed_init();
@@ -1177,7 +1205,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 #ifdef CONFIG_X86_IO_APIC
        setup_ioapic_dest();
 #endif
-       check_nmi_watchdog();
        mtrr_aps_init();
 }
 
@@ -1322,8 +1349,6 @@ int native_cpu_disable(void)
        if (cpu == 0)
                return -EBUSY;
 
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               stop_apic_nmi_watchdog(NULL);
        clear_local_APIC();
 
        cpu_disable_common();
@@ -1354,12 +1379,11 @@ void play_dead_common(void)
 {
        idle_task_exit();
        reset_lazy_tlbstate();
-       irq_ctx_exit(raw_smp_processor_id());
        c1e_remove_cpu(raw_smp_processor_id());
 
        mb();
        /* Ack it */
-       __get_cpu_var(cpu_state) = CPU_DEAD;
+       __this_cpu_write(cpu_state, CPU_DEAD);
 
        /*
         * With physical CPU hotplug, we should halt the cpu
@@ -1367,11 +1391,88 @@ void play_dead_common(void)
        local_irq_disable();
 }
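
The __this_cpu_write() above, like the __this_cpu_read() conversions elsewhere in this merge, follows the percpu-tree idiom: individual fields of a per-CPU variable are accessed through this_cpu operations instead of taking its address via __get_cpu_var()/current_cpu_data. A minimal sketch with a hypothetical helper:

/* Illustration only -- not part of this patch. */
static bool example_this_cpu_is_multicore(void)
{
	/* Compiles to a single %gs-relative load on x86, no per-CPU address computation. */
	return __this_cpu_read(cpu_info.x86_max_cores) > 1;
}
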
 
+/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
+static inline void mwait_play_dead(void)
+{
+       unsigned int eax, ebx, ecx, edx;
+       unsigned int highest_cstate = 0;
+       unsigned int highest_subcstate = 0;
+       int i;
+       void *mwait_ptr;
+
+       if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_MWAIT))
+               return;
+       if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH))
+               return;
+       if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
+               return;
+
+       eax = CPUID_MWAIT_LEAF;
+       ecx = 0;
+       native_cpuid(&eax, &ebx, &ecx, &edx);
+
+       /*
+        * eax will be 0 if EDX enumeration is not valid.
+        * It is initialized below to the (cstate, sub_cstate) value when EDX is valid.
+        */
+       if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
+               eax = 0;
+       } else {
+               edx >>= MWAIT_SUBSTATE_SIZE;
+               for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
+                       if (edx & MWAIT_SUBSTATE_MASK) {
+                               highest_cstate = i;
+                               highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
+                       }
+               }
+               eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
+                       (highest_subcstate - 1);
+       }
+
+       /*
+        * This should be a memory location in a cache line which is
+        * unlikely to be touched by other processors.  The actual
+        * content is immaterial as it is not actually modified in any way.
+        */
+       mwait_ptr = &current_thread_info()->flags;
+
+       wbinvd();
+
+       while (1) {
+               /*
+                * The CLFLUSH is a workaround for erratum AAI65 for
+                * the Xeon 7400 series.  It's not clear it is actually
+                * needed, but it should be harmless in either case.
+                * The WBINVD is insufficient due to the spurious-wakeup
+                * case where we return around the loop.
+                */
+               clflush(mwait_ptr);
+               __monitor(mwait_ptr, 0, 0);
+               mb();
+               __mwait(eax, 0);
+       }
+}
+
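
To make the hint computation in mwait_play_dead() concrete, here is the same loop as a standalone helper together with a worked example (illustration only; example_mwait_hint() is hypothetical, the MWAIT_* constants are those used above). With edx = 0x00000620 the CPU reports 2 C1 sub-states and 6 C2 sub-states, so the loop ends with highest_cstate = 1 and highest_subcstate = 6, and the hint becomes (1 << 4) | (6 - 1) = 0x15, i.e. the deepest enumerated state:

/* Illustration only -- mirrors the computation in mwait_play_dead(). */
static unsigned int example_mwait_hint(unsigned int edx)
{
	unsigned int highest_cstate = 0, highest_subcstate = 0;
	int i;

	edx >>= MWAIT_SUBSTATE_SIZE;		/* skip the C0 nibble */
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}

	/* example_mwait_hint(0x00000620) == 0x15 */
	return (highest_cstate << MWAIT_SUBSTATE_SIZE) | (highest_subcstate - 1);
}
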
+static inline void hlt_play_dead(void)
+{
+       if (__this_cpu_read(cpu_info.x86) >= 4)
+               wbinvd();
+
+       while (1) {
+               native_halt();
+       }
+}
+
 void native_play_dead(void)
 {
        play_dead_common();
        tboot_shutdown(TB_SHUTDOWN_WFS);
-       wbinvd_halt();
+
+       mwait_play_dead();      /* Only returns on failure */
+       hlt_play_dead();
 }
 
 #else /* ... !CONFIG_HOTPLUG_CPU */