perfcounters: fix task clock counter
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed, 17 Dec 2008 14:10:57 +0100

Impact: fix per task clock counter precision

The task clock counter used to read current->se.sum_exec_runtime
directly, which only advances when the scheduler banks the task's
runtime, so reads between accounting points returned stale values.
Sample it under the current runqueue lock instead, via the new
curr_rq_lock_irq_save()/curr_rq_unlock_irq_restore() helpers, and add
the not-yet-banked delta returned by __task_delta_exec(), giving the
counter full sched_clock precision.

While at it, call perf_counter_task_sched_out() from schedule() with
the runqueue lock held instead of from prepare_task_switch(), run
perf_counter_task_tick() under rq->lock, and flush inherited counters
to the parent in exit_notify() before the parent gets woken up by
child-exit notifications.

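As a rough usage sketch (illustrative only, not part of the patch): a
reader of the per-task clock composes the new helpers like this, which
is the pattern the perf_counter code below follows; it is only valid
for the currently running task:

        unsigned long flags;
        u64 now;

        /*
         * Holding the current runqueue's lock with IRQs off keeps the
         * task on the CPU while its clock is sampled:
         */
        curr_rq_lock_irq_save(&flags);

        /* banked runtime plus the not-yet-banked delta (1 == refresh rq->clock): */
        now = current->se.sum_exec_runtime + __task_delta_exec(current, 1);

        curr_rq_unlock_irq_restore(&flags);
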
Signed-off-by: Ingo Molnar <mingo@elte.hu>

include/linux/kernel_stat.h
kernel/exit.c
kernel/perf_counter.c
kernel/sched.c

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 4a145ca..1b2e324 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -66,7 +66,15 @@ static inline unsigned int kstat_irqs(unsigned int irq)
        return sum;
 }
 
+
+/*
+ * Lock/unlock the current runqueue - to extract task statistics:
+ */
+extern void curr_rq_lock_irq_save(unsigned long *flags);
+extern void curr_rq_unlock_irq_restore(unsigned long *flags);
+extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
 extern unsigned long long task_delta_exec(struct task_struct *);
+
 extern void account_user_time(struct task_struct *, cputime_t);
 extern void account_user_time_scaled(struct task_struct *, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t);
diff --git a/kernel/exit.c b/kernel/exit.c
index d336c90..244edfd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -922,6 +922,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
        forget_original_parent(tsk);
        exit_task_namespaces(tsk);
 
+       /*
+        * Flush inherited counters to the parent - before the parent
+        * gets woken up by child-exit notifications.
+        */
+       perf_counter_exit_task(tsk);
+
        write_lock_irq(&tasklist_lock);
        if (group_dead)
                kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -1093,11 +1099,6 @@ NORET_TYPE void do_exit(long code)
        mpol_put(tsk->mempolicy);
        tsk->mempolicy = NULL;
 #endif
-       /*
-        * These must happen late, after the PID is not
-        * hashed anymore, but still at a point that may sleep:
-        */
-       perf_counter_exit_task(tsk);
 #ifdef CONFIG_FUTEX
        if (unlikely(!list_empty(&tsk->pi_state_list)))
                exit_pi_state_list(tsk);
@@ -1121,6 +1122,12 @@ NORET_TYPE void do_exit(long code)
        if (tsk->splice_pipe)
                __free_pipe_info(tsk->splice_pipe);
 
+       /*
+        * These must happen late, after the PID is not
+        * hashed anymore, but still at a point that may sleep:
+        */
+       perf_counter_exit_task(tsk);
+
        preempt_disable();
        /* causes final put_task_struct in finish_task_switch(). */
        tsk->state = TASK_DEAD;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 961d651..f1110ac 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -18,6 +18,7 @@
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
 
 /*
@@ -106,7 +107,8 @@ static void __perf_counter_remove_from_context(void *info)
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;
 
-       spin_lock_irqsave(&ctx->lock, flags);
+       curr_rq_lock_irq_save(&flags);
+       spin_lock(&ctx->lock);
 
        if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
                counter->hw_ops->disable(counter);
@@ -135,7 +137,8 @@ static void __perf_counter_remove_from_context(void *info)
                            perf_max_counters - perf_reserved_percpu);
        }
 
-       spin_unlock_irqrestore(&ctx->lock, flags);
+       spin_unlock(&ctx->lock);
+       curr_rq_unlock_irq_restore(&flags);
 }
 
 
@@ -209,7 +212,8 @@ static void __perf_install_in_context(void *info)
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;
 
-       spin_lock_irqsave(&ctx->lock, flags);
+       curr_rq_lock_irq_save(&flags);
+       spin_lock(&ctx->lock);
 
        /*
         * Protect the list operation against NMI by disabling the
@@ -232,7 +236,8 @@ static void __perf_install_in_context(void *info)
        if (!ctx->task && cpuctx->max_pertask)
                cpuctx->max_pertask--;
 
-       spin_unlock_irqrestore(&ctx->lock, flags);
+       spin_unlock(&ctx->lock);
+       curr_rq_unlock_irq_restore(&flags);
 }
 
 /*
@@ -438,15 +443,19 @@ int perf_counter_task_disable(void)
        struct task_struct *curr = current;
        struct perf_counter_context *ctx = &curr->perf_counter_ctx;
        struct perf_counter *counter;
+       unsigned long flags;
        u64 perf_flags;
        int cpu;
 
        if (likely(!ctx->nr_counters))
                return 0;
 
-       local_irq_disable();
+       curr_rq_lock_irq_save(&flags);
        cpu = smp_processor_id();
 
+       /* force the update of the task clock: */
+       __task_delta_exec(curr, 1);
+
        perf_counter_task_sched_out(curr, cpu);
 
        spin_lock(&ctx->lock);
@@ -463,7 +472,7 @@ int perf_counter_task_disable(void)
 
        spin_unlock(&ctx->lock);
 
-       local_irq_enable();
+       curr_rq_unlock_irq_restore(&flags);
 
        return 0;
 }
@@ -473,15 +482,19 @@ int perf_counter_task_enable(void)
        struct task_struct *curr = current;
        struct perf_counter_context *ctx = &curr->perf_counter_ctx;
        struct perf_counter *counter;
+       unsigned long flags;
        u64 perf_flags;
        int cpu;
 
        if (likely(!ctx->nr_counters))
                return 0;
 
-       local_irq_disable();
+       curr_rq_lock_irq_save(&flags);
        cpu = smp_processor_id();
 
+       /* force the update of the task clock: */
+       __task_delta_exec(curr, 1);
+
        spin_lock(&ctx->lock);
 
        /*
@@ -493,6 +506,7 @@ int perf_counter_task_enable(void)
                if (counter->state != PERF_COUNTER_STATE_OFF)
                        continue;
                counter->state = PERF_COUNTER_STATE_INACTIVE;
+               counter->hw_event.disabled = 0;
        }
        hw_perf_restore(perf_flags);
 
@@ -500,7 +514,7 @@ int perf_counter_task_enable(void)
 
        perf_counter_task_sched_in(curr, cpu);
 
-       local_irq_enable();
+       curr_rq_unlock_irq_restore(&flags);
 
        return 0;
 }
@@ -540,8 +554,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 static void __read(void *info)
 {
        struct perf_counter *counter = info;
+       unsigned long flags;
 
+       curr_rq_lock_irq_save(&flags);
        counter->hw_ops->read(counter);
+       curr_rq_unlock_irq_restore(&flags);
 }
 
 static u64 perf_counter_read(struct perf_counter *counter)
@@ -860,13 +877,27 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
        .read           = cpu_clock_perf_counter_read,
 };
 
-static void task_clock_perf_counter_update(struct perf_counter *counter)
+/*
+ * Called from within the scheduler:
+ */
+static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
 {
-       u64 prev, now;
+       struct task_struct *curr = counter->task;
+       u64 delta;
+
+       WARN_ON_ONCE(counter->task != current);
+
+       delta = __task_delta_exec(curr, update);
+
+       return curr->se.sum_exec_runtime + delta;
+}
+
+static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
+{
+       u64 prev;
        s64 delta;
 
        prev = atomic64_read(&counter->hw.prev_count);
-       now = current->se.sum_exec_runtime;
 
        atomic64_set(&counter->hw.prev_count, now);
 
@@ -877,17 +908,23 @@ static void task_clock_perf_counter_update(struct perf_counter *counter)
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
-       task_clock_perf_counter_update(counter);
+       u64 now = task_clock_perf_counter_val(counter, 1);
+
+       task_clock_perf_counter_update(counter, now);
 }
 
 static void task_clock_perf_counter_enable(struct perf_counter *counter)
 {
-       atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime);
+       u64 now = task_clock_perf_counter_val(counter, 0);
+
+       atomic64_set(&counter->hw.prev_count, now);
 }
 
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
-       task_clock_perf_counter_update(counter);
+       u64 now = task_clock_perf_counter_val(counter, 0);
+
+       task_clock_perf_counter_update(counter, now);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
@@ -1267,6 +1304,7 @@ __perf_counter_exit_task(struct task_struct *child,
 {
        struct perf_counter *parent_counter;
        u64 parent_val, child_val;
+       unsigned long flags;
        u64 perf_flags;
 
        /*
@@ -1275,7 +1313,7 @@ __perf_counter_exit_task(struct task_struct *child,
         * Be careful about zapping the list - IRQ/NMI context
         * could still be processing it:
         */
-       local_irq_disable();
+       curr_rq_lock_irq_save(&flags);
        perf_flags = hw_perf_save_disable();
 
        if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
@@ -1294,7 +1332,7 @@ __perf_counter_exit_task(struct task_struct *child,
        list_del_init(&child_counter->list_entry);
 
        hw_perf_restore(perf_flags);
-       local_irq_enable();
+       curr_rq_unlock_irq_restore(&flags);
 
        parent_counter = child_counter->parent;
        /*
diff --git a/kernel/sched.c b/kernel/sched.c
index 382cfdb..4d84ff4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -638,7 +638,7 @@ static inline int cpu_of(struct rq *rq)
 #define task_rq(p)             cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
 {
        rq->clock = sched_clock_cpu(cpu_of(rq));
 }
@@ -969,6 +969,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
        }
 }
 
+void curr_rq_lock_irq_save(unsigned long *flags)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       local_irq_save(*flags);
+       rq = cpu_rq(smp_processor_id());
+       spin_lock(&rq->lock);
+}
+
+void curr_rq_unlock_irq_restore(unsigned long *flags)
+       __releases(rq->lock)
+{
+       struct rq *rq;
+
+       rq = cpu_rq(smp_processor_id());
+       spin_unlock(&rq->lock);
+       local_irq_restore(*flags);
+}
+
 void task_rq_unlock_wait(struct task_struct *p)
 {
        struct rq *rq = task_rq(p);
@@ -2558,7 +2578,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
 {
        fire_sched_out_preempt_notifiers(prev, next);
-       perf_counter_task_sched_out(prev, cpu_of(rq));
        prepare_lock_switch(rq, next);
        prepare_arch_switch(next);
 }
@@ -4093,6 +4112,29 @@ EXPORT_PER_CPU_SYMBOL(kstat);
  * Return any ns on the sched_clock that have not yet been banked in
  * @p in case that task is currently running.
  */
+unsigned long long __task_delta_exec(struct task_struct *p, int update)
+{
+       s64 delta_exec;
+       struct rq *rq;
+
+       rq = task_rq(p);
+       WARN_ON_ONCE(!runqueue_is_locked());
+       WARN_ON_ONCE(!task_current(rq, p));
+
+       if (update)
+               update_rq_clock(rq);
+
+       delta_exec = rq->clock - p->se.exec_start;
+
+       WARN_ON_ONCE(delta_exec < 0);
+
+       return delta_exec;
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
+ */
 unsigned long long task_delta_exec(struct task_struct *p)
 {
        unsigned long flags;
@@ -4316,13 +4358,13 @@ void scheduler_tick(void)
        update_rq_clock(rq);
        update_cpu_load(rq);
        curr->sched_class->task_tick(rq, curr, 0);
+       perf_counter_task_tick(curr, cpu);
        spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
        rq->idle_at_tick = idle_cpu(cpu);
        trigger_load_balance(rq, cpu);
 #endif
-       perf_counter_task_tick(curr, cpu);
 }
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
@@ -4512,6 +4554,7 @@ need_resched_nonpreemptible:
 
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
+               perf_counter_task_sched_out(prev, cpu);
 
                rq->nr_switches++;
                rq->curr = next;