Merge commit 'linus/master' into merge-linus
[linux-2.6.git] / kernel / time / tick-sched.c
index 9234e44..a547be1 100644 (file)
@@ -9,7 +9,7 @@
  *
  *  Started by: Thomas Gleixner and Ingo Molnar
  *
- *  For licencing details see kernel-base/COPYING
+ *  Distribute under GPLv2.
  */
 #include <linux/cpu.h>
 #include <linux/err.h>
@@ -20,6 +20,9 @@
 #include <linux/profile.h>
 #include <linux/sched.h>
 #include <linux/tick.h>
+#include <linux/module.h>
+
+#include <asm/irq_regs.h>
 
 #include "tick-internal.h"
 
@@ -46,6 +49,13 @@ static void tick_do_update_jiffies64(ktime_t now)
        unsigned long ticks = 0;
        ktime_t delta;
 
+       /*
+        * Do a quick check without holding xtime_lock:
+        */
+       delta = ktime_sub(now, last_jiffies_update);
+       if (delta.tv64 < tick_period.tv64)
+               return;
+
        /* Reevalute with xtime_lock held */
        write_seqlock(&xtime_lock);
 
@@ -66,6 +76,9 @@ static void tick_do_update_jiffies64(ktime_t now)
                                                           incr * ticks);
                }
                do_timer(++ticks);
+
+               /* Keep the tick_next_period variable up to date */
+               tick_next_period = ktime_add(last_jiffies_update, tick_period);
        }
        write_sequnlock(&xtime_lock);
 }
@@ -133,12 +146,63 @@ void tick_nohz_update_jiffies(void)
 
        cpu_clear(cpu, nohz_cpu_mask);
        now = ktime_get();
+       ts->idle_waketime = now;
 
        local_irq_save(flags);
        tick_do_update_jiffies64(now);
        local_irq_restore(flags);
+
+       touch_softlockup_watchdog();
+}
+
+void tick_nohz_stop_idle(int cpu)
+{
+       struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+       if (ts->idle_active) {
+               ktime_t now, delta;
+               now = ktime_get();
+               delta = ktime_sub(now, ts->idle_entrytime);
+               ts->idle_lastupdate = now;
+               ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+               ts->idle_active = 0;
+
+               sched_clock_idle_wakeup_event(0);
+       }
+}
+
+static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+{
+       ktime_t now, delta;
+
+       now = ktime_get();
+       if (ts->idle_active) {
+               delta = ktime_sub(now, ts->idle_entrytime);
+               ts->idle_lastupdate = now;
+               ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+       }
+       ts->idle_entrytime = now;
+       ts->idle_active = 1;
+       sched_clock_idle_sleep_event();
+       return now;
 }
 
+u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
+{
+       struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+       if (!tick_nohz_enabled)
+               return -1;
+
+       if (ts->idle_active)
+               *last_update_time = ktime_to_us(ts->idle_lastupdate);
+       else
+               *last_update_time = ktime_to_us(ktime_get());
+
+       return ktime_to_us(ts->idle_sleeptime);
+}
+EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
+
 /**
  * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
  *
@@ -146,40 +210,55 @@ void tick_nohz_update_jiffies(void)
  * Called either from the idle loop or from irq_exit() when an idle period was
  * just interrupted by an interrupt which did not cause a reschedule.
  */
-void tick_nohz_stop_sched_tick(void)
+void tick_nohz_stop_sched_tick(int inidle)
 {
        unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
        struct tick_sched *ts;
-       ktime_t last_update, expires, now, delta;
+       ktime_t last_update, expires, now;
+       struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
        int cpu;
 
        local_irq_save(flags);
 
        cpu = smp_processor_id();
        ts = &per_cpu(tick_cpu_sched, cpu);
+       now = tick_nohz_start_idle(ts);
+
+       /*
+        * If this cpu is offline and it is the one which updates
+        * jiffies, then give up the assignment and let it be taken by
+        * the cpu which runs the tick timer next. If we don't drop
+        * this here the jiffies might be stale and do_timer() never
+        * invoked.
+        */
+       if (unlikely(!cpu_online(cpu))) {
+               if (cpu == tick_do_timer_cpu)
+                       tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+       }
 
        if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
                goto end;
 
+       if (!inidle && !ts->inidle)
+               goto end;
+
+       ts->inidle = 1;
+
        if (need_resched())
                goto end;
 
-       cpu = smp_processor_id();
-       BUG_ON(local_softirq_pending());
+       if (unlikely(local_softirq_pending())) {
+               static int ratelimit;
 
-       now = ktime_get();
-       /*
-        * When called from irq_exit we need to account the idle sleep time
-        * correctly.
-        */
-       if (ts->tick_stopped) {
-               delta = ktime_sub(now, ts->idle_entrytime);
-               ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+               if (ratelimit < 10) {
+                       printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
+                              local_softirq_pending());
+                       ratelimit++;
+               }
+               goto end;
        }
 
-       ts->idle_entrytime = now;
        ts->idle_calls++;
-
        /* Read jiffies and the time when jiffies were updated last */
        do {
                seq = read_seqbegin(&xtime_lock);
@@ -191,7 +270,7 @@ void tick_nohz_stop_sched_tick(void)
        next_jiffies = get_next_timer_interrupt(last_jiffies);
        delta_jiffies = next_jiffies - last_jiffies;
 
-       if (rcu_needs_cpu(cpu))
+       if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
                delta_jiffies = 1;
        /*
         * Do not stop the tick, if we are only one off
@@ -213,10 +292,46 @@ void tick_nohz_stop_sched_tick(void)
                 * the scheduler tick in nohz_restart_sched_tick.
                 */
                if (!ts->tick_stopped) {
-                       ts->idle_tick = ts->sched_timer.expires;
+                       if (select_nohz_load_balancer(1)) {
+                               /*
+                                * sched tick not stopped!
+                                */
+                               cpu_clear(cpu, nohz_cpu_mask);
+                               goto out;
+                       }
+
+                       ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
                        ts->tick_stopped = 1;
                        ts->idle_jiffies = last_jiffies;
+                       rcu_enter_nohz();
+               }
+
+               /*
+                * If this cpu is the one which updates jiffies, then
+                * give up the assignment and let it be taken by the
+                * cpu which runs the tick timer next, which might be
+                * this cpu as well. If we don't drop this here the
+                * jiffies might be stale and do_timer() never
+                * invoked.
+                */
+               if (cpu == tick_do_timer_cpu)
+                       tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+
+               ts->idle_sleeps++;
+
+               /*
+                * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
+                * there is no timer pending or at least extremely far
+                * into the future (12 days for HZ=1000). In this case
+                * we simply stop the tick timer:
+                */
+               if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) {
+                       ts->idle_expires.tv64 = KTIME_MAX;
+                       if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+                               hrtimer_cancel(&ts->sched_timer);
+                       goto out;
                }
+
                /*
                 * calculate the expiry time for the next timer wheel
                 * timer
@@ -224,7 +339,6 @@ void tick_nohz_stop_sched_tick(void)
                expires = ktime_add_ns(last_update, tick_period.tv64 *
                                       delta_jiffies);
                ts->idle_expires = expires;
-               ts->idle_sleeps++;
 
                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
                        hrtimer_start(&ts->sched_timer, expires,
@@ -232,7 +346,7 @@ void tick_nohz_stop_sched_tick(void)
                        /* Check, if the timer was already in the past */
                        if (hrtimer_active(&ts->sched_timer))
                                goto out;
-               } else if(!tick_program_event(expires, 0))
+               } else if (!tick_program_event(expires, 0))
                                goto out;
                /*
                 * We are past the event already. So we crossed a
@@ -246,12 +360,25 @@ void tick_nohz_stop_sched_tick(void)
 out:
        ts->next_jiffies = next_jiffies;
        ts->last_jiffies = last_jiffies;
+       ts->sleep_length = ktime_sub(dev->next_event, now);
 end:
        local_irq_restore(flags);
 }
 
 /**
- * nohz_restart_sched_tick - restart the idle tick from the idle task
+ * tick_nohz_get_sleep_length - return the length of the current sleep
+ *
+ * Called from power state control code with interrupts disabled
+ */
+ktime_t tick_nohz_get_sleep_length(void)
+{
+       struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+       return ts->sleep_length;
+}
+
+/**
+ * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
  *
  * Restart the idle tick when the CPU is woken up from idle
  */
@@ -260,22 +387,27 @@ void tick_nohz_restart_sched_tick(void)
        int cpu = smp_processor_id();
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
        unsigned long ticks;
-       ktime_t now, delta;
+       ktime_t now;
 
-       if (!ts->tick_stopped)
+       local_irq_disable();
+       tick_nohz_stop_idle(cpu);
+
+       if (!ts->inidle || !ts->tick_stopped) {
+               ts->inidle = 0;
+               local_irq_enable();
                return;
+       }
+
+       ts->inidle = 0;
+
+       rcu_exit_nohz();
 
        /* Update jiffies first */
+       select_nohz_load_balancer(0);
        now = ktime_get();
-
-       local_irq_disable();
        tick_do_update_jiffies64(now);
        cpu_clear(cpu, nohz_cpu_mask);
 
-       /* Account the idle time */
-       delta = ktime_sub(now, ts->idle_entrytime);
-       ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-
        /*
         * We stopped the tick in idle. Update process times would miss the
         * time we slept as update_process_times does only a 1 tick
@@ -292,26 +424,28 @@ void tick_nohz_restart_sched_tick(void)
                sub_preempt_count(HARDIRQ_OFFSET);
        }
 
+       touch_softlockup_watchdog();
        /*
         * Cancel the scheduled timer and restore the tick
         */
        ts->tick_stopped  = 0;
+       ts->idle_exittime = now;
        hrtimer_cancel(&ts->sched_timer);
-       ts->sched_timer.expires = ts->idle_tick;
+       hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
 
        while (1) {
                /* Forward the time to expire in the future */
                hrtimer_forward(&ts->sched_timer, now, tick_period);
 
                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-                       hrtimer_start(&ts->sched_timer,
-                                     ts->sched_timer.expires,
+                       hrtimer_start_expires(&ts->sched_timer,
                                      HRTIMER_MODE_ABS);
                        /* Check, if the timer was already in the past */
                        if (hrtimer_active(&ts->sched_timer))
                                break;
                } else {
-                       if (!tick_program_event(ts->sched_timer.expires, 0))
+                       if (!tick_program_event(
+                               hrtimer_get_expires(&ts->sched_timer), 0))
                                break;
                }
                /* Update jiffies and reread time */
@@ -324,7 +458,7 @@ void tick_nohz_restart_sched_tick(void)
 static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
 {
        hrtimer_forward(&ts->sched_timer, now, tick_period);
-       return tick_program_event(ts->sched_timer.expires, 0);
+       return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
 }
 
 /*
@@ -334,12 +468,24 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 {
        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
        struct pt_regs *regs = get_irq_regs();
+       int cpu = smp_processor_id();
        ktime_t now = ktime_get();
 
        dev->next_event.tv64 = KTIME_MAX;
 
+       /*
+        * Check if the do_timer duty was dropped. We don't care about
+        * concurrency: This happens only when the cpu in charge went
+        * into a long sleep. If two cpus happen to assign themselves to
+        * this duty, then the jiffies update is still serialized by
+        * xtime_lock.
+        */
+       if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+               tick_do_timer_cpu = cpu;
+
        /* Check, if the jiffies need an update */
-       tick_do_update_jiffies64(now);
+       if (tick_do_timer_cpu == cpu)
+               tick_do_update_jiffies64(now);
 
        /*
         * When we are idle and the tick is stopped, we have to touch
@@ -395,7 +541,7 @@ static void tick_nohz_switch_to_nohz(void)
        next = tick_init_jiffy_update();
 
        for (;;) {
-               ts->sched_timer.expires = next;
+               hrtimer_set_expires(&ts->sched_timer, next);
                if (!tick_program_event(next, 0))
                        break;
                next = ktime_add(next, tick_period);
@@ -417,19 +563,32 @@ static inline void tick_nohz_switch_to_nohz(void) { }
  */
 #ifdef CONFIG_HIGH_RES_TIMERS
 /*
- * We rearm the timer until we get disabled by the idle code
+ * We rearm the timer until we get disabled by the idle code.
  * Called with interrupts disabled and timer->base->cpu_base->lock held.
  */
 static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 {
        struct tick_sched *ts =
                container_of(timer, struct tick_sched, sched_timer);
-       struct hrtimer_cpu_base *base = timer->base->cpu_base;
        struct pt_regs *regs = get_irq_regs();
        ktime_t now = ktime_get();
+       int cpu = smp_processor_id();
+
+#ifdef CONFIG_NO_HZ
+       /*
+        * Check if the do_timer duty was dropped. We don't care about
+        * concurrency: This happens only when the cpu in charge went
+        * into a long sleep. If two cpus happen to assign themselves to
+        * this duty, then the jiffies update is still serialized by
+        * xtime_lock.
+        */
+       if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+               tick_do_timer_cpu = cpu;
+#endif
 
        /* Check, if the jiffies need an update */
-       tick_do_update_jiffies64(now);
+       if (tick_do_timer_cpu == cpu)
+               tick_do_update_jiffies64(now);
 
        /*
         * Do not call, when we are not in irq context and have
@@ -448,15 +607,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
                        touch_softlockup_watchdog();
                        ts->idle_jiffies++;
                }
-               /*
-                * update_process_times() might take tasklist_lock, hence
-                * drop the base lock. sched-tick hrtimers are per-CPU and
-                * never accessible by userspace APIs, so this is safe to do.
-                */
-               spin_unlock(&base->lock);
                update_process_times(user_mode(regs));
                profile_tick(CPU_PROFILING);
-               spin_lock(&base->lock);
        }
 
        /* Do not restart, when we are in the idle loop */
@@ -475,21 +627,25 @@ void tick_setup_sched_timer(void)
 {
        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
        ktime_t now = ktime_get();
+       u64 offset;
 
        /*
         * Emulate tick processing via per-CPU hrtimers:
         */
        hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        ts->sched_timer.function = tick_sched_timer;
-       ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 
-       /* Get the next period */
-       ts->sched_timer.expires = tick_init_jiffy_update();
+       /* Get the next period (per cpu) */
+       hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
+       offset = ktime_to_ns(tick_period) >> 1;
+       do_div(offset, num_possible_cpus());
+       offset *= smp_processor_id();
+       hrtimer_add_expires_ns(&ts->sched_timer, offset);
 
        for (;;) {
                hrtimer_forward(&ts->sched_timer, now, tick_period);
-               hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
-                             HRTIMER_MODE_ABS);
+               hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
                /* Check, if the timer was already in the past */
                if (hrtimer_active(&ts->sched_timer))
                        break;
@@ -501,17 +657,21 @@ void tick_setup_sched_timer(void)
                ts->nohz_mode = NOHZ_MODE_HIGHRES;
 #endif
 }
+#endif /* HIGH_RES_TIMERS */
 
+#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
 void tick_cancel_sched_timer(int cpu)
 {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 
+# ifdef CONFIG_HIGH_RES_TIMERS
        if (ts->sched_timer.base)
                hrtimer_cancel(&ts->sched_timer);
-       ts->tick_stopped = 0;
+# endif
+
        ts->nohz_mode = NOHZ_MODE_INACTIVE;
 }
-#endif /* HIGH_RES_TIMERS */
+#endif
 
 /**
  * Async notification about clocksource changes
@@ -552,7 +712,7 @@ int tick_check_oneshot_change(int allow_nohz)
        if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
                return 0;
 
-       if (!timekeeping_is_continuous() || !tick_is_oneshot_available())
+       if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
                return 0;
 
        if (!allow_nohz)