Remove obsolete #include <linux/config.h>
[linux-2.6.git] / arch / i386 / kernel / smpboot.c
index 47ec767..6f5fea0 100644 (file)
@@ -34,7 +34,6 @@
 *              Rusty Russell   :       Hacked into shape for new "hotplug" boot process. */
 
 #include <linux/module.h>
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 
@@ -52,6 +51,7 @@
 #include <asm/tlbflush.h>
 #include <asm/desc.h>
 #include <asm/arch_hooks.h>
+#include <asm/nmi.h>
 
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
@@ -66,15 +66,14 @@ int smp_num_siblings = 1;
 EXPORT_SYMBOL(smp_num_siblings);
 #endif
 
-/* Package ID of each logical CPU */
-int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
-
-/* Core ID of each logical CPU */
-int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
+/* Last level cache ID of each logical CPU */
+int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
 
+/* representing HT siblings of each logical CPU */
 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_sibling_map);
 
+/* representing HT and core siblings of each logical CPU */
 cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_core_map);
 
@@ -85,11 +84,7 @@ EXPORT_SYMBOL(cpu_online_map);
 cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
 EXPORT_SYMBOL(cpu_callout_map);
-#ifdef CONFIG_HOTPLUG_CPU
-cpumask_t cpu_possible_map = CPU_MASK_ALL;
-#else
 cpumask_t cpu_possible_map;
-#endif
 EXPORT_SYMBOL(cpu_possible_map);
 static cpumask_t smp_commenced_mask;
 
@@ -256,7 +251,7 @@ static void __init synchronize_tsc_bp (void)
                 * all APs synchronize but they loop on '== num_cpus'
                 */
                while (atomic_read(&tsc_count_start) != num_booting_cpus()-1)
-                       mb();
+                       cpu_relax();
                atomic_set(&tsc_count_stop, 0);
                wmb();
                /*
@@ -275,7 +270,7 @@ static void __init synchronize_tsc_bp (void)
                 * Wait for all APs to leave the synchronization point:
                 */
                while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1)
-                       mb();
+                       cpu_relax();
                atomic_set(&tsc_count_start, 0);
                wmb();
                atomic_inc(&tsc_count_stop);
@@ -312,7 +307,9 @@ static void __init synchronize_tsc_bp (void)
                        if (tsc_values[i] < avg)
                                realdelta = -realdelta;
 
-                       printk(KERN_INFO "CPU#%d had %ld usecs TSC skew, fixed it up.\n", i, realdelta);
+                       if (realdelta > 0)
+                               printk(KERN_INFO "CPU#%d had %ld usecs TSC "
+                                       "skew, fixed it up.\n", i, realdelta);
                }
 
                sum += delta;
@@ -330,19 +327,21 @@ static void __init synchronize_tsc_ap (void)
         * this gets called, so we first wait for the BP to
         * finish SMP initialization:
         */
-       while (!atomic_read(&tsc_start_flag)) mb();
+       while (!atomic_read(&tsc_start_flag))
+               cpu_relax();
 
        for (i = 0; i < NR_LOOPS; i++) {
                atomic_inc(&tsc_count_start);
                while (atomic_read(&tsc_count_start) != num_booting_cpus())
-                       mb();
+                       cpu_relax();
 
                rdtscll(tsc_values[smp_processor_id()]);
                if (i == NR_LOOPS-1)
                        write_tsc(0, 0);
 
                atomic_inc(&tsc_count_stop);
-               while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
+               while (atomic_read(&tsc_count_stop) != num_booting_cpus())
+                       cpu_relax();
        }
 }
 #undef NR_LOOPS
@@ -442,35 +441,83 @@ static void __devinit smp_callin(void)
 
 static int cpucount;
 
+/* maps the cpu to the cpumask used for its multi-core sched domain */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+       struct cpuinfo_x86 *c = cpu_data + cpu;
+       /*
+        * For perf, we return the last level cache shared map (llc_shared_map).
+        * If sched_mc/smt_power_savings is set, we return cpu_core_map[cpu].
+        */
+       if (sched_mc_power_savings || sched_smt_power_savings)
+               return cpu_core_map[cpu];
+       else
+               return c->llc_shared_map;
+}
+
+/* representing cpus for which sibling maps can be computed */
+static cpumask_t cpu_sibling_setup_map;
+
 static inline void
 set_cpu_sibling_map(int cpu)
 {
        int i;
+       struct cpuinfo_x86 *c = cpu_data;
+
+       cpu_set(cpu, cpu_sibling_setup_map);
 
        if (smp_num_siblings > 1) {
-               for (i = 0; i < NR_CPUS; i++) {
-                       if (!cpu_isset(i, cpu_callout_map))
-                               continue;
-                       if (cpu_core_id[cpu] == cpu_core_id[i]) {
+               for_each_cpu_mask(i, cpu_sibling_setup_map) {
+                       if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
+                           c[cpu].cpu_core_id == c[i].cpu_core_id) {
                                cpu_set(i, cpu_sibling_map[cpu]);
                                cpu_set(cpu, cpu_sibling_map[i]);
+                               cpu_set(i, cpu_core_map[cpu]);
+                               cpu_set(cpu, cpu_core_map[i]);
+                               cpu_set(i, c[cpu].llc_shared_map);
+                               cpu_set(cpu, c[i].llc_shared_map);
                        }
                }
        } else {
                cpu_set(cpu, cpu_sibling_map[cpu]);
        }
 
-       if (current_cpu_data.x86_num_cores > 1) {
-               for (i = 0; i < NR_CPUS; i++) {
-                       if (!cpu_isset(i, cpu_callout_map))
-                               continue;
-                       if (phys_proc_id[cpu] == phys_proc_id[i]) {
-                               cpu_set(i, cpu_core_map[cpu]);
-                               cpu_set(cpu, cpu_core_map[i]);
-                       }
-               }
-       } else {
+       cpu_set(cpu, c[cpu].llc_shared_map);
+
+       if (current_cpu_data.x86_max_cores == 1) {
                cpu_core_map[cpu] = cpu_sibling_map[cpu];
+               c[cpu].booted_cores = 1;
+               return;
+       }
+
+       for_each_cpu_mask(i, cpu_sibling_setup_map) {
+               if (cpu_llc_id[cpu] != BAD_APICID &&
+                   cpu_llc_id[cpu] == cpu_llc_id[i]) {
+                       cpu_set(i, c[cpu].llc_shared_map);
+                       cpu_set(cpu, c[i].llc_shared_map);
+               }
+               if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
+                       cpu_set(i, cpu_core_map[cpu]);
+                       cpu_set(cpu, cpu_core_map[i]);
+                       /*
+                        *  Does this new cpu bring up a new core?
+                        */
+                       if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+                               /*
+                                * for each core in package, increment
+                                * the booted_cores for this new cpu
+                                */
+                               if (first_cpu(cpu_sibling_map[i]) == i)
+                                       c[cpu].booted_cores++;
+                               /*
+                                * increment the core count for all
+                                * the other cpus in this package
+                                */
+                               if (i != cpu)
+                                       c[i].booted_cores++;
+                       } else if (i != cpu && !c[cpu].booted_cores)
+                               c[cpu].booted_cores = c[i].booted_cores;
+               }
        }
 }
 
@@ -485,6 +532,7 @@ static void __devinit start_secondary(void *unused)
         * things done here to the most necessary things.
         */
        cpu_init();
+       preempt_disable();
        smp_callin();
        while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
                rep_nop();
@@ -847,8 +895,7 @@ static inline struct task_struct * alloc_idle_task(int cpu)
                /* initialize thread_struct.  we really want to avoid destroy
                 * idle tread
                 */
-               idle->thread.esp = (unsigned long)(((struct pt_regs *)
-                       (THREAD_SIZE + (unsigned long) idle->thread_info)) - 1);
+               idle->thread.esp = (unsigned long)task_pt_regs(idle);
                init_idle(idle, cpu);
                return idle;
        }
@@ -876,6 +923,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
        unsigned short nmi_high = 0, nmi_low = 0;
 
        ++cpucount;
+       alternatives_smp_switch(1);
 
        /*
         * We can't use kernel_thread since we must avoid to
@@ -979,7 +1027,6 @@ void cpu_exit_clear(void)
 
        cpu_clear(cpu, cpu_callout_map);
        cpu_clear(cpu, cpu_callin_map);
-       cpu_clear(cpu, cpu_present_map);
 
        cpu_clear(cpu, smp_commenced_mask);
        unmap_cpu_to_logical_apicid(cpu);
@@ -991,27 +1038,39 @@ struct warm_boot_cpu_info {
        int cpu;
 };
 
-static void __devinit do_warm_boot_cpu(void *p)
+static void __cpuinit do_warm_boot_cpu(void *p)
 {
        struct warm_boot_cpu_info *info = p;
        do_boot_cpu(info->apicid, info->cpu);
        complete(info->complete);
 }
 
-int __devinit smp_prepare_cpu(int cpu)
+static int __cpuinit __smp_prepare_cpu(int cpu)
 {
        DECLARE_COMPLETION(done);
        struct warm_boot_cpu_info info;
        struct work_struct task;
        int     apicid, ret;
+       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
-       lock_cpu_hotplug();
        apicid = x86_cpu_to_apicid[cpu];
        if (apicid == BAD_APICID) {
                ret = -ENODEV;
                goto exit;
        }
 
+       /*
+        * the CPU isn't initialized at boot time, allocate gdt table here.
+        * cpu_init will initialize it. Bail out only if the allocation failed.
+        */
+       if (!cpu_gdt_descr->address)
+               cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
+       if (!cpu_gdt_descr->address) {
+               printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
+               ret = -ENOMEM;
+               goto exit;
+       }
+
        info.complete = &done;
        info.apicid = apicid;
        info.cpu = cpu;
@@ -1030,7 +1089,6 @@ int __devinit smp_prepare_cpu(int cpu)
        zap_low_mappings();
        ret = 0;
 exit:
-       unlock_cpu_hotplug();
        return ret;
 }
 #endif
@@ -1062,6 +1120,7 @@ static void smp_tune_scheduling (void)
                        cachesize = 16; /* Pentiums, 2x8kB cache */
                        bandwidth = 100;
                }
+               max_cache_size = cachesize * 1024;
        }
 }
 
@@ -1094,11 +1153,8 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
 
        current_thread_info()->cpu = 0;
        smp_tune_scheduling();
-       cpus_clear(cpu_sibling_map[0]);
-       cpu_set(0, cpu_sibling_map[0]);
 
-       cpus_clear(cpu_core_map[0]);
-       cpu_set(0, cpu_core_map[0]);
+       set_cpu_sibling_map(0);
 
        /*
         * If we couldn't find an SMP configuration at boot time,
@@ -1277,15 +1333,24 @@ static void
 remove_siblinginfo(int cpu)
 {
        int sibling;
+       struct cpuinfo_x86 *c = cpu_data;
 
+       for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+               cpu_clear(cpu, cpu_core_map[sibling]);
+               /*
+                * last thread sibling in this cpu core going down
+                */
+               if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+                       c[sibling].booted_cores--;
+       }
+                       
        for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
                cpu_clear(cpu, cpu_sibling_map[sibling]);
-       for_each_cpu_mask(sibling, cpu_core_map[cpu])
-               cpu_clear(cpu, cpu_core_map[sibling]);
        cpus_clear(cpu_sibling_map[cpu]);
        cpus_clear(cpu_core_map[cpu]);
-       phys_proc_id[cpu] = BAD_APICID;
-       cpu_core_id[cpu] = BAD_APICID;
+       c[cpu].phys_proc_id = 0;
+       c[cpu].cpu_core_id = 0;
+       cpu_clear(cpu, cpu_sibling_setup_map);
 }
 
 int __cpu_disable(void)
@@ -1304,8 +1369,7 @@ int __cpu_disable(void)
        if (cpu == 0)
                return -EBUSY;
 
-       /* We enable the timer again on the exit path of the death loop */
-       disable_APIC_timer();
+       clear_local_APIC();
        /* Allow any queued timer interrupts to get serviced */
        local_irq_enable();
        mdelay(1);
@@ -1329,6 +1393,8 @@ void __cpu_die(unsigned int cpu)
                /* They ack this in play_dead by setting CPU_DEAD */
                if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
                        printk ("CPU %d is now offline\n", cpu);
+                       if (1 == num_online_cpus())
+                               alternatives_smp_switch(0);
                        return;
                }
                msleep(100);
@@ -1350,6 +1416,22 @@ void __cpu_die(unsigned int cpu)
 
 int __devinit __cpu_up(unsigned int cpu)
 {
+#ifdef CONFIG_HOTPLUG_CPU
+       int ret=0;
+
+       /*
+        * We do warm boot only on cpus that had booted earlier.
+        * Otherwise cold boot is all handled from smp_boot_cpus().
+        * cpu_callin_map is set during AP kickstart process. It is reset
+        * when a cpu is taken offline from cpu_exit_clear().
+        */
+       if (!cpu_isset(cpu, cpu_callin_map))
+               ret = __smp_prepare_cpu(cpu);
+
+       if (ret)
+               return -EIO;
+#endif
+
        /* In case one didn't come up */
        if (!cpu_isset(cpu, cpu_callin_map)) {
                printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
@@ -1362,7 +1444,7 @@ int __devinit __cpu_up(unsigned int cpu)
        /* Unleash the CPU! */
        cpu_set(cpu, smp_commenced_mask);
        while (!cpu_isset(cpu, cpu_online_map))
-               mb();
+               cpu_relax();
        return 0;
 }