Merge branch 'tip/perf/ringbuffer-2' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into perf/urgent

This commit is contained in:
Ingo Molnar 2010-10-26 13:14:02 +02:00
commit 5c16d2c813
2 changed files with 172 additions and 175 deletions

View file

@ -62,18 +62,6 @@ enum ring_buffer_type {
unsigned ring_buffer_event_length(struct ring_buffer_event *event);
void *ring_buffer_event_data(struct ring_buffer_event *event);
/**
* ring_buffer_event_time_delta - return the delta timestamp of the event
* @event: the event to get the delta timestamp of
*
* The delta timestamp is the 27 bit timestamp since the last event.
*/
static inline unsigned
ring_buffer_event_time_delta(struct ring_buffer_event *event)
{
return event->time_delta;
}
/*
* ring_buffer_discard_commit will remove an event that has not
* ben committed yet. If this is used, then ring_buffer_unlock_commit

View file

@ -224,6 +224,9 @@ enum {
RB_LEN_TIME_STAMP = 16,
};
#define skip_time_extend(event) \
((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
static inline int rb_null_event(struct ring_buffer_event *event)
{
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
return length + RB_EVNT_HDR_SIZE;
}
/* inline for ring buffer fast paths */
static unsigned
/*
* Return the length of the given event. Will return
* the length of the time extend if the event is a
* time extend.
*/
static inline unsigned
rb_event_length(struct ring_buffer_event *event)
{
switch (event->type_len) {
@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
return 0;
}
/*
* Return total length of time extend and data,
* or just the event length for all other events.
*/
static inline unsigned
rb_event_ts_length(struct ring_buffer_event *event)
{
unsigned len = 0;
if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
/* time extends include the data event after it */
len = RB_LEN_TIME_EXTEND;
event = skip_time_extend(event);
}
return len + rb_event_length(event);
}
/**
* ring_buffer_event_length - return the length of the event
* @event: the event to get the length of
*
* Returns the size of the data load of a data event.
* If the event is something other than a data event, it
* returns the size of the event itself. With the exception
* of a TIME EXTEND, where it still returns the size of the
* data load of the data event after it.
*/
unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{
unsigned length = rb_event_length(event);
unsigned length;
if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
event = skip_time_extend(event);
length = rb_event_length(event);
if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
return length;
length -= RB_EVNT_HDR_SIZE;
@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
static void *
rb_event_data(struct ring_buffer_event *event)
{
if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
event = skip_time_extend(event);
BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
/* If length is in len field, then array[0] has the data */
if (event->type_len)
@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
/* Max number of timestamps that can fit on a page */
#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
int ring_buffer_print_page_header(struct trace_seq *s)
{
struct buffer_data_page field;
@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
iter->head = 0;
}
/* Slow path, do not inline */
static noinline struct ring_buffer_event *
rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
{
event->type_len = RINGBUF_TYPE_TIME_EXTEND;
/* Not the first event on the page? */
if (rb_event_index(event)) {
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
} else {
/* nope, just zero it */
event->time_delta = 0;
event->array[0] = 0;
}
return skip_time_extend(event);
}
/**
* ring_buffer_update_event - update event type and data
* @event: the even to update
@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
* data field.
*/
static void
rb_update_event(struct ring_buffer_event *event,
unsigned type, unsigned length)
rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event, unsigned length,
int add_timestamp, u64 delta)
{
event->type_len = type;
/* Only a commit updates the timestamp */
if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
delta = 0;
switch (type) {
case RINGBUF_TYPE_PADDING:
case RINGBUF_TYPE_TIME_EXTEND:
case RINGBUF_TYPE_TIME_STAMP:
break;
case 0:
length -= RB_EVNT_HDR_SIZE;
if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
event->array[0] = length;
else
event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
break;
default:
BUG();
/*
* If we need to add a timestamp, then we
* add it to the start of the resevered space.
*/
if (unlikely(add_timestamp)) {
event = rb_add_time_stamp(event, delta);
length -= RB_LEN_TIME_EXTEND;
delta = 0;
}
event->time_delta = delta;
length -= RB_EVNT_HDR_SIZE;
if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
event->type_len = 0;
event->array[0] = length;
} else
event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
}
/*
@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(length, &tail_page->write);
}
static struct ring_buffer_event *
/*
* This is the slow path, force gcc not to inline it.
*/
static noinline struct ring_buffer_event *
rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long length, unsigned long tail,
struct buffer_page *tail_page, u64 *ts)
struct buffer_page *tail_page, u64 ts)
{
struct buffer_page *commit_page = cpu_buffer->commit_page;
struct ring_buffer *buffer = cpu_buffer->buffer;
@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* Nested commits always have zero deltas, so
* just reread the time stamp
*/
*ts = rb_time_stamp(buffer);
next_page->page->time_stamp = *ts;
ts = rb_time_stamp(buffer);
next_page->page->time_stamp = ts;
}
out_again:
@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
unsigned type, unsigned long length, u64 *ts)
unsigned long length, u64 ts,
u64 delta, int add_timestamp)
{
struct buffer_page *tail_page;
struct ring_buffer_event *event;
unsigned long tail, write;
/*
* If the time delta since the last event is too big to
* hold in the time field of the event, then we append a
* TIME EXTEND event ahead of the data event.
*/
if (unlikely(add_timestamp))
length += RB_LEN_TIME_EXTEND;
tail_page = cpu_buffer->tail_page;
write = local_add_return(length, &tail_page->write);
@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
tail = write - length;
/* See if we shot pass the end of this buffer page */
if (write > BUF_PAGE_SIZE)
if (unlikely(write > BUF_PAGE_SIZE))
return rb_move_tail(cpu_buffer, length, tail,
tail_page, ts);
@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
event = __rb_page_index(tail_page, tail);
kmemcheck_annotate_bitfield(event, bitfield);
rb_update_event(event, type, length);
rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
/* The passed in type is zero for DATA */
if (likely(!type))
local_inc(&tail_page->entries);
local_inc(&tail_page->entries);
/*
* If this is the first commit on the page, then update
* its timestamp.
*/
if (!tail)
tail_page->page->time_stamp = *ts;
tail_page->page->time_stamp = ts;
return event;
}
@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long addr;
new_index = rb_event_index(event);
old_index = new_index + rb_event_length(event);
old_index = new_index + rb_event_ts_length(event);
addr = (unsigned long)event;
addr &= PAGE_MASK;
@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
return 0;
}
static int
rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
u64 *ts, u64 *delta)
{
struct ring_buffer_event *event;
int ret;
WARN_ONCE(*delta > (1ULL << 59),
KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
(unsigned long long)*delta,
(unsigned long long)*ts,
(unsigned long long)cpu_buffer->write_stamp);
/*
* The delta is too big, we to add a
* new timestamp.
*/
event = __rb_reserve_next(cpu_buffer,
RINGBUF_TYPE_TIME_EXTEND,
RB_LEN_TIME_EXTEND,
ts);
if (!event)
return -EBUSY;
if (PTR_ERR(event) == -EAGAIN)
return -EAGAIN;
/* Only a commited time event can update the write stamp */
if (rb_event_is_commit(cpu_buffer, event)) {
/*
* If this is the first on the page, then it was
* updated with the page itself. Try to discard it
* and if we can't just make it zero.
*/
if (rb_event_index(event)) {
event->time_delta = *delta & TS_MASK;
event->array[0] = *delta >> TS_SHIFT;
} else {
/* try to discard, since we do not need this */
if (!rb_try_to_discard(cpu_buffer, event)) {
/* nope, just zero it */
event->time_delta = 0;
event->array[0] = 0;
}
}
cpu_buffer->write_stamp = *ts;
/* let the caller know this was the commit */
ret = 1;
} else {
/* Try to discard the event */
if (!rb_try_to_discard(cpu_buffer, event)) {
/* Darn, this is just wasted space */
event->time_delta = 0;
event->array[0] = 0;
}
ret = 0;
}
*delta = 0;
return ret;
}
static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
local_inc(&cpu_buffer->committing);
local_inc(&cpu_buffer->commits);
}
static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long commits;
@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
unsigned long length)
{
struct ring_buffer_event *event;
u64 ts, delta = 0;
int commit = 0;
u64 ts, delta;
int nr_loops = 0;
int add_timestamp;
u64 diff;
rb_start_commit(cpu_buffer);
@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
length = rb_calculate_event_length(length);
again:
add_timestamp = 0;
delta = 0;
/*
* We allow for interrupts to reenter here and do a trace.
* If one does, it will cause this original code to loop
@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
goto out_fail;
ts = rb_time_stamp(cpu_buffer->buffer);
diff = ts - cpu_buffer->write_stamp;
/*
* Only the first commit can update the timestamp.
* Yes there is a race here. If an interrupt comes in
* just after the conditional and it traces too, then it
* will also check the deltas. More than one timestamp may
* also be made. But only the entry that did the actual
* commit will be something other than zero.
*/
if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
rb_page_write(cpu_buffer->tail_page) ==
rb_commit_index(cpu_buffer))) {
u64 diff;
diff = ts - cpu_buffer->write_stamp;
/* make sure this diff is calculated here */
barrier();
/* Did the write stamp get updated already? */
if (unlikely(ts < cpu_buffer->write_stamp))
goto get_event;
/* make sure this diff is calculated here */
barrier();
/* Did the write stamp get updated already? */
if (likely(ts >= cpu_buffer->write_stamp)) {
delta = diff;
if (unlikely(test_time_stamp(delta))) {
commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
if (commit == -EBUSY)
goto out_fail;
if (commit == -EAGAIN)
goto again;
RB_WARN_ON(cpu_buffer, commit < 0);
WARN_ONCE(delta > (1ULL << 59),
KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
(unsigned long long)delta,
(unsigned long long)ts,
(unsigned long long)cpu_buffer->write_stamp);
add_timestamp = 1;
}
}
get_event:
event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
event = __rb_reserve_next(cpu_buffer, length, ts,
delta, add_timestamp);
if (unlikely(PTR_ERR(event) == -EAGAIN))
goto again;
if (!event)
goto out_fail;
if (!rb_event_is_commit(cpu_buffer, event))
delta = 0;
event->time_delta = delta;
return event;
out_fail:
@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
#define TRACE_RECURSIVE_DEPTH 16
static int trace_recursive_lock(void)
/* Keep this code out of the fast path cache */
static noinline void trace_recursive_fail(void)
{
current->trace_recursion++;
if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
return 0;
/* Disable all tracing before we do anything else */
tracing_off_permanent();
@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)
in_nmi());
WARN_ON_ONCE(1);
}
static inline int trace_recursive_lock(void)
{
current->trace_recursion++;
if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
return 0;
trace_recursive_fail();
return -1;
}
static void trace_recursive_unlock(void)
static inline void trace_recursive_unlock(void)
{
WARN_ON_ONCE(!current->trace_recursion);
@ -2308,12 +2298,28 @@ static void
rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
u64 delta;
/*
* The event first in the commit queue updates the
* time stamp.
*/
if (rb_event_is_commit(cpu_buffer, event))
cpu_buffer->write_stamp += event->time_delta;
if (rb_event_is_commit(cpu_buffer, event)) {
/*
* A commit event that is first on a page
* updates the write timestamp with the page stamp
*/
if (!rb_event_index(event))
cpu_buffer->write_stamp =
cpu_buffer->commit_page->page->time_stamp;
else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
delta = event->array[0];
delta <<= TS_SHIFT;
delta += event->time_delta;
cpu_buffer->write_stamp += delta;
} else
cpu_buffer->write_stamp += event->time_delta;
}
}
static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
static inline void rb_event_discard(struct ring_buffer_event *event)
{
if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
event = skip_time_extend(event);
/* array[0] holds the actual length for the discarded event */
event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING;
@ -3049,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
again:
/*
* We repeat when a timestamp is encountered. It is possible
* to get multiple timestamps from an interrupt entering just
* as one timestamp is about to be written, or from discarded
* commits. The most that we can have is the number on a single page.
* We repeat when a time extend is encountered.
* Since the time extend is always attached to a data event,
* we should never loop more than once.
* (We never hit the following condition more than twice).
*/
if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
return NULL;
reader = rb_get_reader_page(cpu_buffer);
@ -3130,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
return NULL;
/*
* We repeat when a timestamp is encountered.
* We can get multiple timestamps by nested interrupts or also
* if filtering is on (discarding commits). Since discarding
* commits can be frequent we can get a lot of timestamps.
* But we limit them by not adding timestamps if they begin
* at the start of a page.
* We repeat when a time extend is encountered.
* Since the time extend is always attached to a data event,
* we should never loop more than once.
* (We never hit the following condition more than twice).
*/
if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
return NULL;
if (rb_per_cpu_empty(cpu_buffer))
@ -3835,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
if (len > (commit - read))
len = (commit - read);
size = rb_event_length(event);
/* Always keep the time extend and data together */
size = rb_event_ts_length(event);
if (len < size)
goto out_unlock;
@ -3857,7 +3865,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
break;
event = rb_reader_event(cpu_buffer);
size = rb_event_length(event);
/* Always keep the time extend and data together */
size = rb_event_ts_length(event);
} while (len > size);
/* update bpage */