[PATCH] Add debugging feature /proc/timer_stat
Ingo Molnar [Fri, 16 Feb 2007 09:28:13 +0000 (01:28 -0800)]
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.

Sample output:

# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
  24,     0 swapper          hrtimer_stop_sched_tick (hrtimer_sched_tick)
  11,     0 swapper          sk_reset_timer (tcp_delack_timer)
   6,     0 swapper          hrtimer_stop_sched_tick (hrtimer_sched_tick)
   2,     1 swapper          queue_delayed_work_on (delayed_work_timer_fn)
  17,     0 swapper          hrtimer_restart_sched_tick (hrtimer_sched_tick)
   2,     1 swapper          queue_delayed_work_on (delayed_work_timer_fn)
   4,  2050 pcscd            do_nanosleep (hrtimer_wakeup)
   5,  4179 sshd             sk_reset_timer (tcp_write_timer)
   4,  2248 yum-updatesd     schedule_timeout (process_timeout)
  18,     0 swapper          hrtimer_restart_sched_tick (hrtimer_sched_tick)
   3,     0 swapper          sk_reset_timer (tcp_delack_timer)
   1,     1 swapper          neigh_table_init_no_netlink (neigh_periodic_timer)
   2,     1 swapper          e1000_up (e1000_watchdog)
   1,     1 init             schedule_timeout (process_timeout)
100 total events, 25.24 events/sec

[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Documentation/hrtimer/timer_stats.txt [new file with mode: 0644]
include/linux/hrtimer.h
include/linux/timer.h
kernel/hrtimer.c
kernel/time/Makefile
kernel/time/timer_stats.c [new file with mode: 0644]
kernel/timer.c
kernel/workqueue.c
lib/Kconfig.debug

diff --git a/Documentation/hrtimer/timer_stats.txt b/Documentation/hrtimer/timer_stats.txt
new file mode 100644 (file)
index 0000000..27f782e
--- /dev/null
@@ -0,0 +1,68 @@
+timer_stats - timer usage statistics
+------------------------------------
+
+timer_stats is a debugging facility to make the timer (ab)usage in a Linux
+system visible to kernel and userspace developers. It is not intended for
+production usage as it adds significant overhead to the (hr)timer code and the
+(hr)timer data structures.
+
+timer_stats should be used by kernel and userspace developers to verify that
+their code does not make unduly use of timers. This helps to avoid unnecessary
+wakeups, which should be avoided to optimize power consumption.
+
+It can be enabled by CONFIG_TIMER_STATS in the "Kernel hacking" configuration
+section.
+
+timer_stats collects information about the timer events which are fired in a
+Linux system over a sample period:
+
+- the pid of the task(process) which initialized the timer
+- the name of the process which initialized the timer
+- the function where the timer was intialized
+- the callback function which is associated to the timer
+- the number of events (callbacks)
+
+timer_stats adds an entry to /proc: /proc/timer_stats
+
+This entry is used to control the statistics functionality and to read out the
+sampled information.
+
+The timer_stats functionality is inactive on bootup.
+
+To activate a sample period issue:
+# echo 1 >/proc/timer_stats
+
+To stop a sample period issue:
+# echo 0 >/proc/timer_stats
+
+The statistics can be retrieved by:
+# cat /proc/timer_stats
+
+The readout of /proc/timer_stats automatically disables sampling. The sampled
+information is kept until a new sample period is started. This allows multiple
+readouts.
+
+Sample output of /proc/timer_stats:
+
+Timerstats sample period: 3.888770 s
+  12,     0 swapper          hrtimer_stop_sched_tick (hrtimer_sched_tick)
+  15,     1 swapper          hcd_submit_urb (rh_timer_func)
+   4,   959 kedac            schedule_timeout (process_timeout)
+   1,     0 swapper          page_writeback_init (wb_timer_fn)
+  28,     0 swapper          hrtimer_stop_sched_tick (hrtimer_sched_tick)
+  22,  2948 IRQ 4            tty_flip_buffer_push (delayed_work_timer_fn)
+   3,  3100 bash             schedule_timeout (process_timeout)
+   1,     1 swapper          queue_delayed_work_on (delayed_work_timer_fn)
+   1,     1 swapper          queue_delayed_work_on (delayed_work_timer_fn)
+   1,     1 swapper          neigh_table_init_no_netlink (neigh_periodic_timer)
+   1,  2292 ip               __netdev_watchdog_up (dev_watchdog)
+   1,    23 events/1         do_cache_clean (delayed_work_timer_fn)
+90 total events, 30.0 events/sec
+
+The first column is the number of events, the second column the pid, the third
+column is the name of the process. The forth column shows the function which
+initialized the timer and in parantheses the callback function which was
+executed on expiry.
+
+    Thomas, Ingo
+
index 4ecd991..93b8757 100644 (file)
@@ -119,6 +119,11 @@ struct hrtimer {
        enum hrtimer_cb_mode            cb_mode;
        struct list_head                cb_entry;
 #endif
+#ifdef CONFIG_TIMER_STATS
+       void                            *start_site;
+       char                            start_comm[16];
+       int                             start_pid;
+#endif
 };
 
 /**
@@ -311,4 +316,44 @@ extern unsigned long ktime_divns(const ktime_t kt, s64 div);
 # define ktime_divns(kt, div)          (unsigned long)((kt).tv64 / (div))
 #endif
 
+/*
+ * Timer-statistics info:
+ */
+#ifdef CONFIG_TIMER_STATS
+
+extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
+                                    void *timerf, char * comm);
+
+static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
+{
+       timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
+                                timer->function, timer->start_comm);
+}
+
+extern void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer,
+                                                void *addr);
+
+static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
+{
+       __timer_stats_hrtimer_set_start_info(timer, __builtin_return_address(0));
+}
+
+static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
+{
+       timer->start_site = NULL;
+}
+#else
+static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
+{
+}
+
+static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
+{
+}
+
+static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
+{
+}
+#endif
+
 #endif
index 44d41e9..719113b 100644 (file)
@@ -2,6 +2,7 @@
 #define _LINUX_TIMER_H
 
 #include <linux/list.h>
+#include <linux/ktime.h>
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
 
@@ -15,6 +16,11 @@ struct timer_list {
        unsigned long data;
 
        struct tvec_t_base_s *base;
+#ifdef CONFIG_TIMER_STATS
+       void *start_site;
+       char start_comm[16];
+       int start_pid;
+#endif
 };
 
 extern struct tvec_t_base_s boot_tvec_bases;
@@ -73,6 +79,54 @@ extern unsigned long next_timer_interrupt(void);
  */
 extern unsigned long get_next_timer_interrupt(unsigned long now);
 
+/*
+ * Timer-statistics info:
+ */
+#ifdef CONFIG_TIMER_STATS
+
+extern void init_timer_stats(void);
+
+extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
+                                    void *timerf, char * comm);
+
+static inline void timer_stats_account_timer(struct timer_list *timer)
+{
+       timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
+                                timer->function, timer->start_comm);
+}
+
+extern void __timer_stats_timer_set_start_info(struct timer_list *timer,
+                                              void *addr);
+
+static inline void timer_stats_timer_set_start_info(struct timer_list *timer)
+{
+       __timer_stats_timer_set_start_info(timer, __builtin_return_address(0));
+}
+
+static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
+{
+       timer->start_site = NULL;
+}
+#else
+static inline void init_timer_stats(void)
+{
+}
+
+static inline void timer_stats_account_timer(struct timer_list *timer)
+{
+}
+
+static inline void timer_stats_timer_set_start_info(struct timer_list *timer)
+{
+}
+
+static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
+{
+}
+#endif
+
+extern void delayed_work_timer_fn(unsigned long __data);
+
 /**
  * add_timer - start a timer
  * @timer: the timer to be added
index 62aad8e..476cb0c 100644 (file)
@@ -585,6 +585,18 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
 
 #endif /* CONFIG_HIGH_RES_TIMERS */
 
+#ifdef CONFIG_TIMER_STATS
+void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
+{
+       if (timer->start_site)
+               return;
+
+       timer->start_site = addr;
+       memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
+       timer->start_pid = current->pid;
+}
+#endif
+
 /*
  * Counterpart to lock_timer_base above:
  */
@@ -743,6 +755,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
                 * reprogramming happens in the interrupt handler. This is a
                 * rare case and less expensive than a smp call.
                 */
+               timer_stats_hrtimer_clear_start_info(timer);
                reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
                __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
                                 reprogram);
@@ -791,6 +804,8 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
        }
        timer->expires = tim;
 
+       timer_stats_hrtimer_set_start_info(timer);
+
        enqueue_hrtimer(timer, new_base, base == new_base);
 
        unlock_hrtimer_base(timer, &flags);
@@ -925,6 +940,12 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 
        timer->base = &cpu_base->clock_base[clock_id];
        hrtimer_init_timer_hres(timer);
+
+#ifdef CONFIG_TIMER_STATS
+       timer->start_site = NULL;
+       timer->start_pid = -1;
+       memset(timer->start_comm, 0, TASK_COMM_LEN);
+#endif
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
@@ -1006,6 +1027,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 
                        __remove_hrtimer(timer, base,
                                         HRTIMER_STATE_CALLBACK, 0);
+                       timer_stats_account_hrtimer(timer);
 
                        /*
                         * Note: We clear the CALLBACK bit after
@@ -1050,6 +1072,8 @@ static void run_hrtimer_softirq(struct softirq_action *h)
                timer = list_entry(cpu_base->cb_pending.next,
                                   struct hrtimer, cb_entry);
 
+               timer_stats_account_hrtimer(timer);
+
                fn = timer->function;
                __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
                spin_unlock_irq(&cpu_base->lock);
@@ -1106,6 +1130,8 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
                if (base->softirq_time.tv64 <= timer->expires.tv64)
                        break;
 
+               timer_stats_account_hrtimer(timer);
+
                fn = timer->function;
                __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
                spin_unlock_irq(&cpu_base->lock);
index f246bc8..252b4b5 100644 (file)
@@ -5,3 +5,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS)               += tick-common.o
 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)    += tick-broadcast.o
 obj-$(CONFIG_TICK_ONESHOT)                     += tick-oneshot.o
 obj-$(CONFIG_TICK_ONESHOT)                     += tick-sched.o
+obj-$(CONFIG_TIMER_STATS)                      += timer_stats.o
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
new file mode 100644 (file)
index 0000000..1bc4882
--- /dev/null
@@ -0,0 +1,411 @@
+/*
+ * kernel/time/timer_stats.c
+ *
+ * Collect timer usage statistics.
+ *
+ * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * timer_stats is based on timer_top, a similar functionality which was part of
+ * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the
+ * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based
+ * on dynamic allocation of the statistics entries and linear search based
+ * lookup combined with a global lock, rather than the static array, hash
+ * and per-CPU locking which is used by timer_stats. It was written for the
+ * pre hrtimer kernel code and therefore did not take hrtimers into account.
+ * Nevertheless it provided the base for the timer_stats implementation and
+ * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks
+ * for this effort.
+ *
+ * timer_top.c is
+ *     Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus
+ *     Written by Daniel Petrini <d.pensator@gmail.com>
+ *     timer_top.c was released under the GNU General Public License version 2
+ *
+ * We export the addresses and counting of timer functions being called,
+ * the pid and cmdline from the owner process if applicable.
+ *
+ * Start/stop data collection:
+ * # echo 1[0] >/proc/timer_stats
+ *
+ * Display the information collected so far:
+ * # cat /proc/timer_stats
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * This is our basic unit of interest: a timer expiry event identified
+ * by the timer, its start/expire functions and the PID of the task that
+ * started the timer. We count the number of times an event happens:
+ */
+struct entry {
+       /*
+        * Hash list:
+        */
+       struct entry            *next;
+
+       /*
+        * Hash keys:
+        */
+       void                    *timer;
+       void                    *start_func;
+       void                    *expire_func;
+       pid_t                   pid;
+
+       /*
+        * Number of timeout events:
+        */
+       unsigned long           count;
+
+       /*
+        * We save the command-line string to preserve
+        * this information past task exit:
+        */
+       char                    comm[TASK_COMM_LEN + 1];
+
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Spinlock protecting the tables - not taken during lookup:
+ */
+static DEFINE_SPINLOCK(table_lock);
+
+/*
+ * Per-CPU lookup locks for fast hash lookup:
+ */
+static DEFINE_PER_CPU(spinlock_t, lookup_lock);
+
+/*
+ * Mutex to serialize state changes with show-stats activities:
+ */
+static DEFINE_MUTEX(show_mutex);
+
+/*
+ * Collection status, active/inactive:
+ */
+static int __read_mostly active;
+
+/*
+ * Beginning/end timestamps of measurement:
+ */
+static ktime_t time_start, time_stop;
+
+/*
+ * tstat entry structs only get allocated while collection is
+ * active and never freed during that time - this simplifies
+ * things quite a bit.
+ *
+ * They get freed when a new collection period is started.
+ */
+#define MAX_ENTRIES_BITS       10
+#define MAX_ENTRIES            (1UL << MAX_ENTRIES_BITS)
+
+static unsigned long nr_entries;
+static struct entry entries[MAX_ENTRIES];
+
+static atomic_t overflow_count;
+
+static void reset_entries(void)
+{
+       nr_entries = 0;
+       memset(entries, 0, sizeof(entries));
+       atomic_set(&overflow_count, 0);
+}
+
+static struct entry *alloc_entry(void)
+{
+       if (nr_entries >= MAX_ENTRIES)
+               return NULL;
+
+       return entries + nr_entries++;
+}
+
+/*
+ * The entries are in a hash-table, for fast lookup:
+ */
+#define TSTAT_HASH_BITS                (MAX_ENTRIES_BITS - 1)
+#define TSTAT_HASH_SIZE                (1UL << TSTAT_HASH_BITS)
+#define TSTAT_HASH_MASK                (TSTAT_HASH_SIZE - 1)
+
+#define __tstat_hashfn(entry)                                          \
+       (((unsigned long)(entry)->timer       ^                         \
+         (unsigned long)(entry)->start_func  ^                         \
+         (unsigned long)(entry)->expire_func ^                         \
+         (unsigned long)(entry)->pid           ) & TSTAT_HASH_MASK)
+
+#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
+
+static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
+
+static int match_entries(struct entry *entry1, struct entry *entry2)
+{
+       return entry1->timer       == entry2->timer       &&
+              entry1->start_func  == entry2->start_func  &&
+              entry1->expire_func == entry2->expire_func &&
+              entry1->pid         == entry2->pid;
+}
+
+/*
+ * Look up whether an entry matching this item is present
+ * in the hash already. Must be called with irqs off and the
+ * lookup lock held:
+ */
+static struct entry *tstat_lookup(struct entry *entry, char *comm)
+{
+       struct entry **head, *curr, *prev;
+
+       head = tstat_hashentry(entry);
+       curr = *head;
+
+       /*
+        * The fastpath is when the entry is already hashed,
+        * we do this with the lookup lock held, but with the
+        * table lock not held:
+        */
+       while (curr) {
+               if (match_entries(curr, entry))
+                       return curr;
+
+               curr = curr->next;
+       }
+       /*
+        * Slowpath: allocate, set up and link a new hash entry:
+        */
+       prev = NULL;
+       curr = *head;
+
+       spin_lock(&table_lock);
+       /*
+        * Make sure we have not raced with another CPU:
+        */
+       while (curr) {
+               if (match_entries(curr, entry))
+                       goto out_unlock;
+
+               prev = curr;
+               curr = curr->next;
+       }
+
+       curr = alloc_entry();
+       if (curr) {
+               *curr = *entry;
+               curr->count = 0;
+               memcpy(curr->comm, comm, TASK_COMM_LEN);
+               if (prev)
+                       prev->next = curr;
+               else
+                       *head = curr;
+               curr->next = NULL;
+       }
+ out_unlock:
+       spin_unlock(&table_lock);
+
+       return curr;
+}
+
+/**
+ * timer_stats_update_stats - Update the statistics for a timer.
+ * @timer:     pointer to either a timer_list or a hrtimer
+ * @pid:       the pid of the task which set up the timer
+ * @startf:    pointer to the function which did the timer setup
+ * @timerf:    pointer to the timer callback function of the timer
+ * @comm:      name of the process which set up the timer
+ *
+ * When the timer is already registered, then the event counter is
+ * incremented. Otherwise the timer is registered in a free slot.
+ */
+void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
+                             void *timerf, char * comm)
+{
+       /*
+        * It doesnt matter which lock we take:
+        */
+       spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id());
+       struct entry *entry, input;
+       unsigned long flags;
+
+       input.timer = timer;
+       input.start_func = startf;
+       input.expire_func = timerf;
+       input.pid = pid;
+
+       spin_lock_irqsave(lock, flags);
+       if (!active)
+               goto out_unlock;
+
+       entry = tstat_lookup(&input, comm);
+       if (likely(entry))
+               entry->count++;
+       else
+               atomic_inc(&overflow_count);
+
+ out_unlock:
+       spin_unlock_irqrestore(lock, flags);
+}
+
+static void print_name_offset(struct seq_file *m, unsigned long addr)
+{
+       char namebuf[KSYM_NAME_LEN+1];
+       unsigned long size, offset;
+       const char *sym_name;
+       char *modname;
+
+       sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
+       if (sym_name)
+               seq_printf(m, "%s", sym_name);
+       else
+               seq_printf(m, "<%p>", (void *)addr);
+}
+
+static int tstats_show(struct seq_file *m, void *v)
+{
+       struct timespec period;
+       struct entry *entry;
+       unsigned long ms;
+       long events = 0;
+       ktime_t time;
+       int i;
+
+       mutex_lock(&show_mutex);
+       /*
+        * If still active then calculate up to now:
+        */
+       if (active)
+               time_stop = ktime_get();
+
+       time = ktime_sub(time_stop, time_start);
+
+       period = ktime_to_timespec(time);
+       ms = period.tv_nsec / 1000000;
+
+       seq_puts(m, "Timer Stats Version: v0.1\n");
+       seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
+       if (atomic_read(&overflow_count))
+               seq_printf(m, "Overflow: %d entries\n",
+                       atomic_read(&overflow_count));
+
+       for (i = 0; i < nr_entries; i++) {
+               entry = entries + i;
+               seq_printf(m, "%4lu, %5d %-16s ",
+                               entry->count, entry->pid, entry->comm);
+
+               print_name_offset(m, (unsigned long)entry->start_func);
+               seq_puts(m, " (");
+               print_name_offset(m, (unsigned long)entry->expire_func);
+               seq_puts(m, ")\n");
+
+               events += entry->count;
+       }
+
+       ms += period.tv_sec * 1000;
+       if (!ms)
+               ms = 1;
+
+       if (events && period.tv_sec)
+               seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events,
+                          events / period.tv_sec, events * 1000 / ms);
+       else
+               seq_printf(m, "%ld total events\n", events);
+
+       mutex_unlock(&show_mutex);
+
+       return 0;
+}
+
+/*
+ * After a state change, make sure all concurrent lookup/update
+ * activities have stopped:
+ */
+static void sync_access(void)
+{
+       unsigned long flags;
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags);
+               /* nothing */
+               spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags);
+       }
+}
+
+static ssize_t tstats_write(struct file *file, const char __user *buf,
+                           size_t count, loff_t *offs)
+{
+       char ctl[2];
+
+       if (count != 2 || *offs)
+               return -EINVAL;
+
+       if (copy_from_user(ctl, buf, count))
+               return -EFAULT;
+
+       mutex_lock(&show_mutex);
+       switch (ctl[0]) {
+       case '0':
+               if (active) {
+                       active = 0;
+                       time_stop = ktime_get();
+                       sync_access();
+               }
+               break;
+       case '1':
+               if (!active) {
+                       reset_entries();
+                       time_start = ktime_get();
+                       active = 1;
+               }
+               break;
+       default:
+               count = -EINVAL;
+       }
+       mutex_unlock(&show_mutex);
+
+       return count;
+}
+
+static int tstats_open(struct inode *inode, struct file *filp)
+{
+       return single_open(filp, tstats_show, NULL);
+}
+
+static struct file_operations tstats_fops = {
+       .open           = tstats_open,
+       .read           = seq_read,
+       .write          = tstats_write,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+};
+
+void __init init_timer_stats(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               spin_lock_init(&per_cpu(lookup_lock, cpu));
+}
+
+static int __init init_tstats_procfs(void)
+{
+       struct proc_dir_entry *pe;
+
+       pe = create_proc_entry("timer_stats", 0644, NULL);
+       if (!pe)
+               return -ENOMEM;
+
+       pe->proc_fops = &tstats_fops;
+
+       return 0;
+}
+__initcall(init_tstats_procfs);
index f058e6c..c3c67f3 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/syscalls.h>
 #include <linux/delay.h>
 #include <linux/tick.h>
+#include <linux/kallsyms.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -263,6 +264,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
        list_add_tail(&timer->entry, vec);
 }
 
+#ifdef CONFIG_TIMER_STATS
+void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
+{
+       if (timer->start_site)
+               return;
+
+       timer->start_site = addr;
+       memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
+       timer->start_pid = current->pid;
+}
+#endif
+
 /**
  * init_timer - initialize a timer.
  * @timer: the timer to be initialized
@@ -274,11 +287,16 @@ void fastcall init_timer(struct timer_list *timer)
 {
        timer->entry.next = NULL;
        timer->base = __raw_get_cpu_var(tvec_bases);
+#ifdef CONFIG_TIMER_STATS
+       timer->start_site = NULL;
+       timer->start_pid = -1;
+       memset(timer->start_comm, 0, TASK_COMM_LEN);
+#endif
 }
 EXPORT_SYMBOL(init_timer);
 
 static inline void detach_timer(struct timer_list *timer,
-                                       int clear_pending)
+                               int clear_pending)
 {
        struct list_head *entry = &timer->entry;
 
@@ -325,6 +343,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
        unsigned long flags;
        int ret = 0;
 
+       timer_stats_timer_set_start_info(timer);
        BUG_ON(!timer->function);
 
        base = lock_timer_base(timer, &flags);
@@ -375,6 +394,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
        tvec_base_t *base = per_cpu(tvec_bases, cpu);
        unsigned long flags;
 
+       timer_stats_timer_set_start_info(timer);
        BUG_ON(timer_pending(timer) || !timer->function);
        spin_lock_irqsave(&base->lock, flags);
        timer->base = base;
@@ -407,6 +427,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
 {
        BUG_ON(!timer->function);
 
+       timer_stats_timer_set_start_info(timer);
        /*
         * This is a common optimization triggered by the
         * networking code - if the timer is re-modified
@@ -437,6 +458,7 @@ int del_timer(struct timer_list *timer)
        unsigned long flags;
        int ret = 0;
 
+       timer_stats_timer_clear_start_info(timer);
        if (timer_pending(timer)) {
                base = lock_timer_base(timer, &flags);
                if (timer_pending(timer)) {
@@ -570,6 +592,8 @@ static inline void __run_timers(tvec_base_t *base)
                        fn = timer->function;
                        data = timer->data;
 
+                       timer_stats_account_timer(timer);
+
                        set_running_timer(base, timer);
                        detach_timer(timer, 1);
                        spin_unlock_irq(&base->lock);
@@ -1229,7 +1253,8 @@ static void run_timer_softirq(struct softirq_action *h)
 {
        tvec_base_t *base = __get_cpu_var(tvec_bases);
 
-       hrtimer_run_queues();
+       hrtimer_run_queues();
+
        if (time_after_eq(jiffies, base->timer_jiffies))
                __run_timers(base);
 }
@@ -1675,6 +1700,8 @@ void __init init_timers(void)
        int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
                                (void *)(long)smp_processor_id());
 
+       init_timer_stats();
+
        BUG_ON(err == NOTIFY_BAD);
        register_cpu_notifier(&timers_nb);
        open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
index 020d1ff..b6fa5e6 100644 (file)
@@ -218,7 +218,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
 }
 EXPORT_SYMBOL_GPL(queue_work);
 
-static void delayed_work_timer_fn(unsigned long __data)
+void delayed_work_timer_fn(unsigned long __data)
 {
        struct delayed_work *dwork = (struct delayed_work *)__data;
        struct workqueue_struct *wq = get_wq_data(&dwork->work);
@@ -245,6 +245,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
        struct timer_list *timer = &dwork->timer;
        struct work_struct *work = &dwork->work;
 
+       timer_stats_timer_set_start_info(timer);
        if (delay == 0)
                return queue_work(wq, work);
 
@@ -593,8 +594,10 @@ EXPORT_SYMBOL(schedule_work);
  * After waiting for a given time this puts a job in the kernel-global
  * workqueue.
  */
-int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
+int fastcall schedule_delayed_work(struct delayed_work *dwork,
+                                       unsigned long delay)
 {
+       timer_stats_timer_set_start_info(&dwork->timer);
        return queue_delayed_work(keventd_wq, dwork, delay);
 }
 EXPORT_SYMBOL(schedule_delayed_work);
index 63f04c1..4448f91 100644 (file)
@@ -134,6 +134,17 @@ config SCHEDSTATS
          application, you can say N to avoid the very slight overhead
          this adds.
 
+config TIMER_STATS
+       bool "Collect kernel timers statistics"
+       depends on DEBUG_KERNEL && PROC_FS
+       help
+         If you say Y here, additional code will be inserted into the
+         timer routines to collect statistics about kernel timers being
+         reprogrammed. The statistics can be read from /proc/timer_stats.
+         The statistics collection is started by writing 1 to /proc/timer_stats,
+         writing 0 stops it. This feature is useful to collect information
+         about timer usage patterns in kernel and userspace.
+
 config DEBUG_SLAB
        bool "Debug slab memory allocations"
        depends on DEBUG_KERNEL && SLAB