TEMP: stop_machine: serialize disable IRQ sequence
Prafull Suryawanshi [Mon, 2 Nov 2015 09:09:02 +0000 (14:09 +0530)]
bug 200044022

On T124 platforms, we hit a hang on cpu_down(cpu=3) where
CPU3 gets stuck waiting on the GIC irq_controller_lock in
gic_eoi_irq. The other CPUs have already entered the
DISABLE_IRQ stop-machine state at this point.
This causes the watchdog to time out and the
system to reset. Given that the stopper thread is
scheduled (with preemption disabled), the hang most
likely arises from some issue in
the IRQ context. Entering the DISABLE_IRQ state
sequentially from CPU3 down to CPU0 seems to somehow prevent this hang.

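This change is only meant to be a temporary workaround
to improve system stability and is by no means a fix
of any sort. To that end, it is wrapped in a CONFIG
option (TEGRA_SERIALIZE_DISABLE_IRQ). Note that the Kconfig
entry added by this change uses "default y", so the workaround
is enabled by default wherever its dependencies
(ARCH_TEGRA_12x_SOC && !ARCH_TEGRA_13x_SOC) are met.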

Change-Id: Ic50ecd5a6d429706e4f68bcd133707fae4e692ce
Signed-off-by: Prafull Suryawanshi <prafulls@nvidia.com>
Reviewed-on: http://git-master/r/836921
Reviewed-by: Dhiren Parmar <dparmar@nvidia.com>
Tested-by: Dhiren Parmar <dparmar@nvidia.com>

arch/arm/mach-tegra/Kconfig
kernel/stop_machine.c

index 7157e98..00e0389 100644 (file)
@@ -352,6 +352,17 @@ config TEGRA_CPUQUIET
          high/low power CPU clusters automatically, corresponding to
          CPU frequency scaling.
 
+config TEGRA_SERIALIZE_DISABLE_IRQ
+       bool "controls disable irq sequence"
+       depends on ARCH_TEGRA_12x_SOC && !ARCH_TEGRA_13x_SOC
+       default y
+       help
+         Serialize the per-CPU IRQ disable sequence in the
+         stop_machine() primitive. On T124, a hang has been
+         observed when all 4 CPUs disable IRQs in parallel.
+         This symbol is a workaround (WAR) to avoid that
+         scenario on T124.
+
 config TEGRA_MC_EARLY_ACK
        bool "Enable early acknowledgement from mermory controller"
        depends on ARCH_TEGRA_3x_SOC || ARCH_TEGRA_11x_SOC
index c09f295..67cc0df 100644 (file)
@@ -366,7 +366,14 @@ enum stopmachine_state {
        /* Awaiting everyone to be scheduled. */
        STOPMACHINE_PREPARE,
        /* Disable interrupts. */
+#ifdef CONFIG_TEGRA_SERIALIZE_DISABLE_IRQ
+       STOPMACHINE_DISABLE_IRQ_3,
+       STOPMACHINE_DISABLE_IRQ_2,
+       STOPMACHINE_DISABLE_IRQ_1,
+       STOPMACHINE_DISABLE_IRQ_0,
+#else
        STOPMACHINE_DISABLE_IRQ,
+#endif
        /* Run the function */
        STOPMACHINE_RUN,
        /* Exit */
@@ -427,10 +434,37 @@ static int stop_machine_cpu_stop(void *data)
                if (smdata->state != curstate) {
                        curstate = smdata->state;
                        switch (curstate) {
+#ifdef CONFIG_TEGRA_SERIALIZE_DISABLE_IRQ
+                       case STOPMACHINE_DISABLE_IRQ_3:
+                               if (cpu == 3) {
+                                       local_irq_disable();
+                                       hard_irq_disable();
+                               }
+                               break;
+                       case STOPMACHINE_DISABLE_IRQ_2:
+                               if (cpu == 2) {
+                                       local_irq_disable();
+                                       hard_irq_disable();
+                               }
+                               break;
+                       case STOPMACHINE_DISABLE_IRQ_1:
+                               if (cpu == 1) {
+                                       local_irq_disable();
+                                       hard_irq_disable();
+                               }
+                               break;
+                       case STOPMACHINE_DISABLE_IRQ_0:
+                               if (cpu == 0) {
+                                       local_irq_disable();
+                                       hard_irq_disable();
+                               }
+                               break;
+#else
                        case STOPMACHINE_DISABLE_IRQ:
                                local_irq_disable();
                                hard_irq_disable();
                                break;
+#endif
                        case STOPMACHINE_RUN:
                                if (is_active)
                                        err = smdata->fn(smdata->data);