Merge branch 'perf-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-2.6.git] / kernel / trace / ring_buffer.c
index c3dab05..9ed509a 100644 (file)
@@ -224,6 +224,9 @@ enum {
        RB_LEN_TIME_STAMP = 16,
 };
 
+#define skip_time_extend(event) \
+       ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
+
 static inline int rb_null_event(struct ring_buffer_event *event)
 {
        return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
        return length + RB_EVNT_HDR_SIZE;
 }
 
-/* inline for ring buffer fast paths */
-static unsigned
+/*
+ * Return the length of the given event. Will return
+ * the length of the time extend if the event is a
+ * time extend.
+ */
+static inline unsigned
 rb_event_length(struct ring_buffer_event *event)
 {
        switch (event->type_len) {
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
        return 0;
 }
 
+/*
+ * Return total length of time extend and data,
+ *   or just the event length for all other events.
+ */
+static inline unsigned
+rb_event_ts_length(struct ring_buffer_event *event)
+{
+       unsigned len = 0;
+
+       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+               /* time extends include the data event after it */
+               len = RB_LEN_TIME_EXTEND;
+               event = skip_time_extend(event);
+       }
+       return len + rb_event_length(event);
+}
+
 /**
  * ring_buffer_event_length - return the length of the event
  * @event: the event to get the length of
+ *
+ * Returns the size of the data load of a data event.
+ * If the event is something other than a data event, it
+ * returns the size of the event itself. With the exception
+ * of a TIME EXTEND, where it still returns the size of the
+ * data load of the data event after it.
  */
 unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 {
-       unsigned length = rb_event_length(event);
+       unsigned length;
+
+       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+               event = skip_time_extend(event);
+
+       length = rb_event_length(event);
        if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
                return length;
        length -= RB_EVNT_HDR_SIZE;
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 static void *
 rb_event_data(struct ring_buffer_event *event)
 {
+       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+               event = skip_time_extend(event);
        BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
        /* If length is in len field, then array[0] has the data */
        if (event->type_len)
@@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
 /* Max payload is BUF_PAGE_SIZE - header (8bytes) */
 #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
 
-/* Max number of timestamps that can fit on a page */
-#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
-
 int ring_buffer_print_page_header(struct trace_seq *s)
 {
        struct buffer_data_page field;
@@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
        iter->head = 0;
 }
 
+/* Slow path, do not inline */
+static noinline struct ring_buffer_event *
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
+{
+       event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+
+       /* Not the first event on the page? */
+       if (rb_event_index(event)) {
+               event->time_delta = delta & TS_MASK;
+               event->array[0] = delta >> TS_SHIFT;
+       } else {
+               /* nope, just zero it */
+               event->time_delta = 0;
+               event->array[0] = 0;
+       }
+
+       return skip_time_extend(event);
+}
+
 /**
  * ring_buffer_update_event - update event type and data
  * @event: the even to update
@@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
  * data field.
  */
 static void
-rb_update_event(struct ring_buffer_event *event,
-                        unsigned type, unsigned length)
+rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+               struct ring_buffer_event *event, unsigned length,
+               int add_timestamp, u64 delta)
 {
-       event->type_len = type;
-
-       switch (type) {
-
-       case RINGBUF_TYPE_PADDING:
-       case RINGBUF_TYPE_TIME_EXTEND:
-       case RINGBUF_TYPE_TIME_STAMP:
-               break;
+       /* Only a commit updates the timestamp */
+       if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+               delta = 0;
 
-       case 0:
-               length -= RB_EVNT_HDR_SIZE;
-               if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
-                       event->array[0] = length;
-               else
-                       event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
-               break;
-       default:
-               BUG();
+       /*
+        * If we need to add a timestamp, then we
+        * add it to the start of the resevered space.
+        */
+       if (unlikely(add_timestamp)) {
+               event = rb_add_time_stamp(event, delta);
+               length -= RB_LEN_TIME_EXTEND;
+               delta = 0;
        }
+
+       event->time_delta = delta;
+       length -= RB_EVNT_HDR_SIZE;
+       if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+               event->type_len = 0;
+               event->array[0] = length;
+       } else
+               event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
 }
 
 /*
@@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
        local_sub(length, &tail_page->write);
 }
 
-static struct ring_buffer_event *
+/*
+ * This is the slow path, force gcc not to inline it.
+ */
+static noinline struct ring_buffer_event *
 rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
             unsigned long length, unsigned long tail,
-            struct buffer_page *tail_page, u64 *ts)
+            struct buffer_page *tail_page, u64 ts)
 {
        struct buffer_page *commit_page = cpu_buffer->commit_page;
        struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
                 * Nested commits always have zero deltas, so
                 * just reread the time stamp
                 */
-               *ts = rb_time_stamp(buffer);
-               next_page->page->time_stamp = *ts;
+               ts = rb_time_stamp(buffer);
+               next_page->page->time_stamp = ts;
        }
 
  out_again:
@@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
 static struct ring_buffer_event *
 __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-                 unsigned type, unsigned long length, u64 *ts)
+                 unsigned long length, u64 ts,
+                 u64 delta, int add_timestamp)
 {
        struct buffer_page *tail_page;
        struct ring_buffer_event *event;
        unsigned long tail, write;
 
+       /*
+        * If the time delta since the last event is too big to
+        * hold in the time field of the event, then we append a
+        * TIME EXTEND event ahead of the data event.
+        */
+       if (unlikely(add_timestamp))
+               length += RB_LEN_TIME_EXTEND;
+
        tail_page = cpu_buffer->tail_page;
        write = local_add_return(length, &tail_page->write);
 
@@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
        tail = write - length;
 
        /* See if we shot pass the end of this buffer page */
-       if (write > BUF_PAGE_SIZE)
+       if (unlikely(write > BUF_PAGE_SIZE))
                return rb_move_tail(cpu_buffer, length, tail,
                                    tail_page, ts);
 
@@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
        event = __rb_page_index(tail_page, tail);
        kmemcheck_annotate_bitfield(event, bitfield);
-       rb_update_event(event, type, length);
+       rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
 
-       /* The passed in type is zero for DATA */
-       if (likely(!type))
-               local_inc(&tail_page->entries);
+       local_inc(&tail_page->entries);
 
        /*
         * If this is the first commit on the page, then update
         * its timestamp.
         */
        if (!tail)
-               tail_page->page->time_stamp = *ts;
+               tail_page->page->time_stamp = ts;
 
        return event;
 }
@@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
        unsigned long addr;
 
        new_index = rb_event_index(event);
-       old_index = new_index + rb_event_length(event);
+       old_index = new_index + rb_event_ts_length(event);
        addr = (unsigned long)event;
        addr &= PAGE_MASK;
 
@@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
        return 0;
 }
 
-static int
-rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
-                 u64 *ts, u64 *delta)
-{
-       struct ring_buffer_event *event;
-       int ret;
-
-       WARN_ONCE(*delta > (1ULL << 59),
-                 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
-                 (unsigned long long)*delta,
-                 (unsigned long long)*ts,
-                 (unsigned long long)cpu_buffer->write_stamp);
-
-       /*
-        * The delta is too big, we to add a
-        * new timestamp.
-        */
-       event = __rb_reserve_next(cpu_buffer,
-                                 RINGBUF_TYPE_TIME_EXTEND,
-                                 RB_LEN_TIME_EXTEND,
-                                 ts);
-       if (!event)
-               return -EBUSY;
-
-       if (PTR_ERR(event) == -EAGAIN)
-               return -EAGAIN;
-
-       /* Only a commited time event can update the write stamp */
-       if (rb_event_is_commit(cpu_buffer, event)) {
-               /*
-                * If this is the first on the page, then it was
-                * updated with the page itself. Try to discard it
-                * and if we can't just make it zero.
-                */
-               if (rb_event_index(event)) {
-                       event->time_delta = *delta & TS_MASK;
-                       event->array[0] = *delta >> TS_SHIFT;
-               } else {
-                       /* try to discard, since we do not need this */
-                       if (!rb_try_to_discard(cpu_buffer, event)) {
-                               /* nope, just zero it */
-                               event->time_delta = 0;
-                               event->array[0] = 0;
-                       }
-               }
-               cpu_buffer->write_stamp = *ts;
-               /* let the caller know this was the commit */
-               ret = 1;
-       } else {
-               /* Try to discard the event */
-               if (!rb_try_to_discard(cpu_buffer, event)) {
-                       /* Darn, this is just wasted space */
-                       event->time_delta = 0;
-                       event->array[0] = 0;
-               }
-               ret = 0;
-       }
-
-       *delta = 0;
-
-       return ret;
-}
-
 static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
 {
        local_inc(&cpu_buffer->committing);
        local_inc(&cpu_buffer->commits);
 }
 
-static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
 {
        unsigned long commits;
 
@@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
                      unsigned long length)
 {
        struct ring_buffer_event *event;
-       u64 ts, delta = 0;
-       int commit = 0;
+       u64 ts, delta;
        int nr_loops = 0;
+       int add_timestamp;
+       u64 diff;
 
        rb_start_commit(cpu_buffer);
 
@@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 
        length = rb_calculate_event_length(length);
  again:
+       add_timestamp = 0;
+       delta = 0;
+
        /*
         * We allow for interrupts to reenter here and do a trace.
         * If one does, it will cause this original code to loop
@@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
                goto out_fail;
 
        ts = rb_time_stamp(cpu_buffer->buffer);
+       diff = ts - cpu_buffer->write_stamp;
 
-       /*
-        * Only the first commit can update the timestamp.
-        * Yes there is a race here. If an interrupt comes in
-        * just after the conditional and it traces too, then it
-        * will also check the deltas. More than one timestamp may
-        * also be made. But only the entry that did the actual
-        * commit will be something other than zero.
-        */
-       if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
-                  rb_page_write(cpu_buffer->tail_page) ==
-                  rb_commit_index(cpu_buffer))) {
-               u64 diff;
-
-               diff = ts - cpu_buffer->write_stamp;
-
-               /* make sure this diff is calculated here */
-               barrier();
-
-               /* Did the write stamp get updated already? */
-               if (unlikely(ts < cpu_buffer->write_stamp))
-                       goto get_event;
+       /* make sure this diff is calculated here */
+       barrier();
 
+       /* Did the write stamp get updated already? */
+       if (likely(ts >= cpu_buffer->write_stamp)) {
                delta = diff;
                if (unlikely(test_time_stamp(delta))) {
-
-                       commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
-                       if (commit == -EBUSY)
-                               goto out_fail;
-
-                       if (commit == -EAGAIN)
-                               goto again;
-
-                       RB_WARN_ON(cpu_buffer, commit < 0);
+                       WARN_ONCE(delta > (1ULL << 59),
+                                 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+                                 (unsigned long long)delta,
+                                 (unsigned long long)ts,
+                                 (unsigned long long)cpu_buffer->write_stamp);
+                       add_timestamp = 1;
                }
        }
 
- get_event:
-       event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
+       event = __rb_reserve_next(cpu_buffer, length, ts,
+                                 delta, add_timestamp);
        if (unlikely(PTR_ERR(event) == -EAGAIN))
                goto again;
 
        if (!event)
                goto out_fail;
 
-       if (!rb_event_is_commit(cpu_buffer, event))
-               delta = 0;
-
-       event->time_delta = delta;
-
        return event;
 
  out_fail:
@@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 
 #define TRACE_RECURSIVE_DEPTH 16
 
-static int trace_recursive_lock(void)
+/* Keep this code out of the fast path cache */
+static noinline void trace_recursive_fail(void)
 {
-       current->trace_recursion++;
-
-       if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
-               return 0;
-
        /* Disable all tracing before we do anything else */
        tracing_off_permanent();
 
@@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)
                    in_nmi());
 
        WARN_ON_ONCE(1);
+}
+
+static inline int trace_recursive_lock(void)
+{
+       current->trace_recursion++;
+
+       if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+               return 0;
+
+       trace_recursive_fail();
+
        return -1;
 }
 
-static void trace_recursive_unlock(void)
+static inline void trace_recursive_unlock(void)
 {
        WARN_ON_ONCE(!current->trace_recursion);
 
@@ -2308,12 +2298,28 @@ static void
 rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
                      struct ring_buffer_event *event)
 {
+       u64 delta;
+
        /*
         * The event first in the commit queue updates the
         * time stamp.
         */
-       if (rb_event_is_commit(cpu_buffer, event))
-               cpu_buffer->write_stamp += event->time_delta;
+       if (rb_event_is_commit(cpu_buffer, event)) {
+               /*
+                * A commit event that is first on a page
+                * updates the write timestamp with the page stamp
+                */
+               if (!rb_event_index(event))
+                       cpu_buffer->write_stamp =
+                               cpu_buffer->commit_page->page->time_stamp;
+               else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+                       delta = event->array[0];
+                       delta <<= TS_SHIFT;
+                       delta += event->time_delta;
+                       cpu_buffer->write_stamp += delta;
+               } else
+                       cpu_buffer->write_stamp += event->time_delta;
+       }
 }
 
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
 static inline void rb_event_discard(struct ring_buffer_event *event)
 {
+       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+               event = skip_time_extend(event);
+
        /* array[0] holds the actual length for the discarded event */
        event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
        event->type_len = RINGBUF_TYPE_PADDING;
@@ -3049,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
 
  again:
        /*
-        * We repeat when a timestamp is encountered. It is possible
-        * to get multiple timestamps from an interrupt entering just
-        * as one timestamp is about to be written, or from discarded
-        * commits. The most that we can have is the number on a single page.
+        * We repeat when a time extend is encountered.
+        * Since the time extend is always attached to a data event,
+        * we should never loop more than once.
+        * (We never hit the following condition more than twice).
         */
-       if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
                return NULL;
 
        reader = rb_get_reader_page(cpu_buffer);
@@ -3130,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
                return NULL;
 
        /*
-        * We repeat when a timestamp is encountered.
-        * We can get multiple timestamps by nested interrupts or also
-        * if filtering is on (discarding commits). Since discarding
-        * commits can be frequent we can get a lot of timestamps.
-        * But we limit them by not adding timestamps if they begin
-        * at the start of a page.
+        * We repeat when a time extend is encountered.
+        * Since the time extend is always attached to a data event,
+        * we should never loop more than once.
+        * (We never hit the following condition more than twice).
         */
-       if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
                return NULL;
 
        if (rb_per_cpu_empty(cpu_buffer))
@@ -3835,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
                if (len > (commit - read))
                        len = (commit - read);
 
-               size = rb_event_length(event);
+               /* Always keep the time extend and data together */
+               size = rb_event_ts_length(event);
 
                if (len < size)
                        goto out_unlock;
@@ -3857,7 +3865,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
                                break;
 
                        event = rb_reader_event(cpu_buffer);
-                       size = rb_event_length(event);
+                       /* Always keep the time extend and data together */
+                       size = rb_event_ts_length(event);
                } while (len > size);
 
                /* update bpage */