Merge commit 'v2.6.29-rc7' into perfcounters/core
[linux-2.6.git] / kernel / sched.c
index 4007561..78f4424 100644 (file)
@@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
        ktime_t now;
 
-       if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
+       if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return;
 
        if (hrtimer_active(&rt_b->rt_period_timer))
@@ -558,6 +558,7 @@ struct rq {
        struct load_weight load;
        unsigned long nr_load_updates;
        u64 nr_switches;
+       u64 nr_migrations_in;
 
        struct cfs_rq cfs;
        struct rt_rq rt;
@@ -668,7 +669,7 @@ static inline int cpu_of(struct rq *rq)
 #define task_rq(p)             cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
 {
        rq->clock = sched_clock_cpu(cpu_of(rq));
 }
@@ -979,6 +980,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
        }
 }
 
+void curr_rq_lock_irq_save(unsigned long *flags)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       local_irq_save(*flags);
+       rq = cpu_rq(smp_processor_id());
+       spin_lock(&rq->lock);
+}
+
+void curr_rq_unlock_irq_restore(unsigned long *flags)
+       __releases(rq->lock)
+{
+       struct rq *rq;
+
+       rq = cpu_rq(smp_processor_id());
+       spin_unlock(&rq->lock);
+       local_irq_restore(*flags);
+}
+
 void task_rq_unlock_wait(struct task_struct *p)
 {
        struct rq *rq = task_rq(p);
@@ -1885,12 +1906,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                p->se.sleep_start -= clock_offset;
        if (p->se.block_start)
                p->se.block_start -= clock_offset;
+#endif
        if (old_cpu != new_cpu) {
-               schedstat_inc(p, se.nr_migrations);
+               p->se.nr_migrations++;
+               new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
                if (task_hot(p, old_rq->clock, NULL))
                        schedstat_inc(p, se.nr_forced2_migrations);
-       }
 #endif
+       }
        p->se.vruntime -= old_cfsrq->min_vruntime -
                                         new_cfsrq->min_vruntime;
 
@@ -2242,6 +2266,27 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:         the task to evaluate
+ * @func:      the function to be called
+ * @info:      the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+                             void (*func) (void *info), void *info)
+{
+       int cpu;
+
+       preempt_disable();
+       cpu = task_cpu(p);
+       if (task_curr(p))
+               smp_call_function_single(cpu, func, info, 1);
+       preempt_enable();
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2266,16 +2311,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
        if (!sched_feat(SYNC_WAKEUPS))
                sync = 0;
 
-       if (!sync) {
-               if (current->se.avg_overlap < sysctl_sched_migration_cost &&
-                         p->se.avg_overlap < sysctl_sched_migration_cost)
-                       sync = 1;
-       } else {
-               if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
-                         p->se.avg_overlap >= sysctl_sched_migration_cost)
-                       sync = 0;
-       }
-
 #ifdef CONFIG_SMP
        if (sched_feat(LB_WAKEUP_UPDATE)) {
                struct sched_domain *sd;
@@ -2394,6 +2429,7 @@ static void __sched_fork(struct task_struct *p)
        p->se.exec_start                = 0;
        p->se.sum_exec_runtime          = 0;
        p->se.prev_sum_exec_runtime     = 0;
+       p->se.nr_migrations             = 0;
        p->se.last_wakeup               = 0;
        p->se.avg_overlap               = 0;
 
@@ -2614,6 +2650,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
+       perf_counter_task_sched_in(current, cpu_of(rq));
        finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
        if (current->sched_class->post_schedule)
@@ -2776,6 +2813,21 @@ unsigned long nr_active(void)
 }
 
 /*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_switches(cpu) - number of context switches on that cpu
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_switches(int cpu)
+{
+       return cpu_rq(cpu)->nr_switches;
+}
+
+u64 cpu_nr_migrations(int cpu)
+{
+       return cpu_rq(cpu)->nr_migrations_in;
+}
+
+/*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
  */
@@ -3890,19 +3942,24 @@ int select_nohz_load_balancer(int stop_tick)
        int cpu = smp_processor_id();
 
        if (stop_tick) {
-               cpumask_set_cpu(cpu, nohz.cpu_mask);
                cpu_rq(cpu)->in_nohz_recently = 1;
 
-               /*
-                * If we are going offline and still the leader, give up!
-                */
-               if (!cpu_active(cpu) &&
-                   atomic_read(&nohz.load_balancer) == cpu) {
+               if (!cpu_active(cpu)) {
+                       if (atomic_read(&nohz.load_balancer) != cpu)
+                               return 0;
+
+                       /*
+                        * If we are going offline and still the leader,
+                        * give up!
+                        */
                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
                                BUG();
+
                        return 0;
                }
 
+               cpumask_set_cpu(cpu, nohz.cpu_mask);
+
                /* time for ilb owner also to sleep */
                if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                        if (atomic_read(&nohz.load_balancer) == cpu)
@@ -4142,6 +4199,29 @@ EXPORT_PER_CPU_SYMBOL(kstat);
  * Return any ns on the sched_clock that have not yet been banked in
  * @p in case that task is currently running.
  */
+unsigned long long __task_delta_exec(struct task_struct *p, int update)
+{
+       s64 delta_exec;
+       struct rq *rq;
+
+       rq = task_rq(p);
+       WARN_ON_ONCE(!runqueue_is_locked());
+       WARN_ON_ONCE(!task_current(rq, p));
+
+       if (update)
+               update_rq_clock(rq);
+
+       delta_exec = rq->clock - p->se.exec_start;
+
+       WARN_ON_ONCE(delta_exec < 0);
+
+       return delta_exec;
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
+ */
 unsigned long long task_delta_exec(struct task_struct *p)
 {
        unsigned long flags;
@@ -4401,6 +4481,7 @@ void scheduler_tick(void)
        update_rq_clock(rq);
        update_cpu_load(rq);
        curr->sched_class->task_tick(rq, curr, 0);
+       perf_counter_task_tick(curr, cpu);
        spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -4596,6 +4677,7 @@ need_resched_nonpreemptible:
 
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
+               perf_counter_task_sched_out(prev, cpu);
 
                rq->nr_switches++;
                rq->curr = next;
@@ -4697,8 +4779,8 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-                            int nr_exclusive, int sync, void *key)
+void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+                       int nr_exclusive, int sync, void *key)
 {
        wait_queue_t *curr, *next;
 
@@ -6944,20 +7026,26 @@ static void free_rootdomain(struct root_domain *rd)
 
 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 {
+       struct root_domain *old_rd = NULL;
        unsigned long flags;
 
        spin_lock_irqsave(&rq->lock, flags);
 
        if (rq->rd) {
-               struct root_domain *old_rd = rq->rd;
+               old_rd = rq->rd;
 
                if (cpumask_test_cpu(rq->cpu, old_rd->online))
                        set_rq_offline(rq);
 
                cpumask_clear_cpu(rq->cpu, old_rd->span);
 
-               if (atomic_dec_and_test(&old_rd->refcount))
-                       free_rootdomain(old_rd);
+               /*
+                * If we dont want to free the old_rt yet then
+                * set old_rd to NULL to skip the freeing later
+                * in this function:
+                */
+               if (!atomic_dec_and_test(&old_rd->refcount))
+                       old_rd = NULL;
        }
 
        atomic_inc(&rd->refcount);
@@ -6968,6 +7056,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
                set_rq_online(rq);
 
        spin_unlock_irqrestore(&rq->lock, flags);
+
+       if (old_rd)
+               free_rootdomain(old_rd);
 }
 
 static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
@@ -9215,6 +9306,16 @@ static int sched_rt_global_constraints(void)
 
        return ret;
 }
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+       /* Don't accept realtime tasks when there is no way for them to run */
+       if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+               return 0;
+
+       return 1;
+}
+
 #else /* !CONFIG_RT_GROUP_SCHED */
 static int sched_rt_global_constraints(void)
 {
@@ -9308,8 +9409,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
                      struct task_struct *tsk)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
-       /* Don't accept realtime tasks when there is no way for them to run */
-       if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
+       if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
                return -EINVAL;
 #else
        /* We don't support RT-tasks being in separate groups */