From 92dd102ec84c64bac6c30233b8b254119fd3be54 Mon Sep 17 00:00:00 2001
From: "shenping.matt"
Date: Fri, 22 Dec 2023 10:24:28 +0800
Subject: [PATCH] Trace ringbuffer synced to latest 6.7

Panic fix for arm64 due to kernel data leaking (usercopy_hardened)

Signed-off-by: shenping.matt
---
 driver/LKM/include/print_event.h  |   2 +-
 driver/LKM/include/trace_buffer.h |  17 +-
 driver/LKM/src/trace_buffer.c     | 612 ++++++++++++++++--------------
 3 files changed, 339 insertions(+), 292 deletions(-)

diff --git a/driver/LKM/include/print_event.h b/driver/LKM/include/print_event.h
index aa2a84089..da97f6ab6 100644
--- a/driver/LKM/include/print_event.h
+++ b/driver/LKM/include/print_event.h
@@ -181,7 +181,7 @@
 	__entry = tb_event_data(event);			\
 	__entry->head.id = __class->id;			\
 	{ assign; }					\
-	tb_unlock_commit(__class->trace, event);	\
+	tb_unlock_commit(__class->trace);		\
 }							\
 							\
 static notrace void name##_print(proto)			\
diff --git a/driver/LKM/include/trace_buffer.h b/driver/LKM/include/trace_buffer.h
index afb08e968..0450c81e8 100644
--- a/driver/LKM/include/trace_buffer.h
+++ b/driver/LKM/include/trace_buffer.h
@@ -31,12 +31,15 @@
 #define raw_spin_lock_init(x)	((x)->slock = 0)
 #define raw_spin_lock_irq	__raw_spin_lock
 #define raw_spin_unlock_irq	__raw_spin_unlock
+/*
+ * It is safe to ignore the irq flags here since we only write the
+ * ring buffer from kprobe/tracepoint callbacks
+ */
 #define raw_spin_lock_irqsave(l, f)	do {(f) = 0; __raw_spin_lock(l);} while(0)
 #define raw_spin_unlock_irqrestore(l, f) do {__raw_spin_unlock(l);} while(0)
 #endif

 struct tb_ring;
-struct tb_iter;

 /*
  * Don't refer to this struct directly, use functions below.
@@ -111,6 +114,7 @@
 	__tb_alloc((size), (flags), &__key);	\
 })

+void tb_wake_up(struct tb_ring *ring);
 int tb_wait(struct tb_ring *ring, int cpu, int full);
 void tb_free(struct tb_ring *ring);

@@ -129,8 +133,7 @@ u64 tb_event_timestamp(struct tb_ring *ring,
 struct tb_event *tb_lock_reserve(struct tb_ring *ring, unsigned long length);
-int tb_unlock_commit(struct tb_ring *ring,
-		     struct tb_event *event);
+int tb_unlock_commit(struct tb_ring *ring);
 int tb_write(struct tb_ring *ring, unsigned long length, void *data);

@@ -146,7 +149,7 @@ int tb_write(struct tb_ring *ring,
  *  if (some_condition)
  *    tb_discard_commit(buffer, event);
  *  else
- *    tb_unlock_commit(buffer, event);
+ *    tb_unlock_commit(buffer);
  */
 void tb_discard_commit(struct tb_ring *ring, struct tb_event *event);

@@ -180,12 +183,6 @@ size_t tb_nr_dirty_pages(struct tb_ring *ring, int cpu);

 void tb_change_overwrite(struct tb_ring *ring, int val);

-struct tb_iter *
-tb_read_prepare(struct tb_ring *ring, int cpu, gfp_t flags);
-void tb_read_prepare_sync(void);
-void tb_read_start(struct tb_iter *iter);
-void tb_read_finish(struct tb_iter *iter);
-
 u64 tb_time_stamp(struct tb_ring *ring);
 void tb_set_time_stamp_abs(struct tb_ring *ring, bool abs);
 bool tb_time_stamp_abs(struct tb_ring *ring);
diff --git a/driver/LKM/src/trace_buffer.c b/driver/LKM/src/trace_buffer.c
index f5b29ae6a..dfe93a7be 100644
--- a/driver/LKM/src/trace_buffer.c
+++ b/driver/LKM/src/trace_buffer.c
@@ -1,6 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Generic ring buffer
+ * Modified from the generic ring buffer of kernel v6.7
+ * 1) ring_buffer_iter stuff removed
+ * 2) ringbuffer resizing removed
+ * 3) poll waiting routines removed
+ * 4) test case: test_ringbuffer removed
  *
  * Copyright (C) 2008 Steven Rostedt
  */
@@ -29,6 +33,14 @@
 #define READ_ONCE(var) (*((volatile typeof(var) *)(&(var))))
 #endif

+/*
+ * The "absolute" timestamp in the buffer is only 59 bits.
+ * If a clock has the 5 MSBs set, it needs to be saved and
+ * reinserted.
+ */
+#define TS_MSB		(0xf8ULL << 56)
+#define ABS_TS_MASK	(~TS_MSB)
+
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 35)
 /* sched_clock not export for kernels below 2.6.35 */
 #define tb_sched_clock (((u64)jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ))
@@ -344,10 +356,11 @@ static void tb_init_page(struct buffer_data_page *bpage)
 	local_set(&bpage->commit, 0);
 }

-/*
- * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
- * this issue out.
- */
+static __always_inline unsigned int tb_page_commit(struct buffer_page *bpage)
+{
+	return local_read(&bpage->page->commit);
+}
+
 static void free_buffer_page(struct buffer_page *bpage)
 {
 	free_page((unsigned long)bpage->page);
@@ -359,9 +372,7 @@
  */
 static inline int test_time_stamp(u64 delta)
 {
-	if (delta & TS_DELTA_TEST)
-		return 1;
-	return 0;
+	return !!(delta & TS_DELTA_TEST);
 }

 #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
@@ -373,6 +384,7 @@ struct tb_irq_work {
 	void (*wakeup)(struct tb_irq_work *);
 	wait_queue_head_t waiters;
 	wait_queue_head_t full_waiters;
+	long wait_index;
 	bool waiters_pending;
 	bool full_waiters_pending;
 	bool wakeup_full;
@@ -437,6 +449,7 @@ struct tb_time_struct {
 	local_t cnt;
 	local_t top;
 	local_t bottom;
+	local_t msb;
 };
 #else
 struct tb_time_struct {
@@ -481,6 +494,7 @@ struct tb_per_cpu {
 	local_t committing;
 	local_t commits;
 	local_t pages_touched;
+	local_t pages_lost;
 	local_t pages_read;
 	long last_pages_touch;
 	size_t shortest_full;
@@ -490,6 +504,8 @@ struct tb_per_cpu {
 	tb_time_t before_stamp;
 	u64 event_stamp[MAX_NEST];
 	u64 read_stamp;
+	/* pages removed since last reset */
+	unsigned long pages_removed;
 	/* ring buffer pages to update, > 0 to add, < 0 to remove */
 	long nr_pages_to_update;
 	struct list_head new_pages; /* new pages to add */
@@ -535,6 +551,7 @@ struct tb_iter {
 	struct buffer_page *head_page;
 	struct buffer_page *cache_reader_page;
 	unsigned long cache_read;
+	unsigned long cache_pages_removed;
 	u64 read_stamp;
 	u64 page_stamp;
 	struct tb_event *event;
@@ -551,7 +568,6 @@
  * For the ring buffer, 64 bit required operations for the time is
  * the following:
  *
- *  - Only need 59 bits (uses 60 to make it even).
  *  - Reads may fail if it interrupted a modification of the time stamp.
  *      It will succeed if it did not interrupt another write even if
  *      the read itself is interrupted by a write.
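
[Editor's note: the TS_MSB/ABS_TS_MASK macros above are what let a 64-bit
clock survive the buffer's 59-bit absolute timestamps. Below is a minimal
user-space sketch of that round trip; the two macros are copied from the
patch, fix_abs_ts() mirrors the tb_fix_abs_ts() helper added later in this
file, and main() with its sample value is hypothetical.]

	#include <stdint.h>
	#include <stdio.h>

	#define TS_MSB      (0xf8ULL << 56)	/* the 5 MSBs a clock may set */
	#define ABS_TS_MASK (~TS_MSB)		/* the 59 bits an event can hold */

	/* Re-attach the 5 MSBs that were dropped when the absolute
	 * timestamp was written into the buffer (cf. tb_fix_abs_ts()). */
	static uint64_t fix_abs_ts(uint64_t abs, uint64_t save_ts)
	{
		if (save_ts & TS_MSB) {
			abs |= save_ts & TS_MSB;
			/* the 59-bit part may have wrapped since save_ts */
			if (abs < save_ts)
				abs += 1ULL << 59;
		}
		return abs;
	}

	int main(void)
	{
		uint64_t clock  = 0xf800000000001234ULL; /* clock with MSBs set */
		uint64_t stored = clock & ABS_TS_MASK;	 /* what the event stores */

		/* prints f800000000001234: the full clock is recovered */
		printf("%llx\n", (unsigned long long)fix_abs_ts(stored, clock));
		return 0;
	}
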
@@ -576,6 +592,7 @@ struct tb_iter {
  */
 #define RB_TIME_SHIFT	30
 #define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
+#define RB_TIME_MSB_SHIFT	60

 static inline int tb_time_cnt(unsigned long val)
 {
@@ -595,7 +612,7 @@ static inline u64 tb_time_val(unsigned long top, unsigned long bottom)

 static inline bool __tb_time_read(tb_time_t *t, u64 *ret, unsigned long *cnt)
 {
-	unsigned long top, bottom;
+	unsigned long top, bottom, msb;
 	unsigned long c;

 	/*
@@ -607,15 +624,17 @@ static inline bool __tb_time_read(tb_time_t *t, u64 *ret, unsigned long *cnt)
 		c = local_read(&t->cnt);
 		top = local_read(&t->top);
 		bottom = local_read(&t->bottom);
+		msb = local_read(&t->msb);
 	} while (c != local_read(&t->cnt));

 	*cnt = tb_time_cnt(top);

-	/* If top and bottom counts don't match, this interrupted a write */
-	if (*cnt != tb_time_cnt(bottom))
+	/* If top, msb or bottom counts don't match, this interrupted a write */
+	if (*cnt != tb_time_cnt(msb) || *cnt != tb_time_cnt(bottom))
 		return false;

-	*ret = tb_time_val(top, bottom);
+	/* The shift to msb will lose its cnt bits */
+	*ret = tb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
 	return true;
 }

@@ -631,10 +650,12 @@ static inline unsigned long tb_time_val_cnt(unsigned long val, unsigned long cnt
 	return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
 }

-static inline void tb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
+static inline void tb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
+				 unsigned long *msb)
 {
 	*top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
 	*bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
+	*msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
 }

 static inline void tb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
@@ -645,15 +666,16 @@ static inline void tb_time_val_set(local_t *t, unsigned long val, unsigned long

 static void tb_time_set(tb_time_t *t, u64 val)
 {
-	unsigned long cnt, top, bottom;
+	unsigned long cnt, top, bottom, msb;

-	tb_time_split(val, &top, &bottom);
+	tb_time_split(val, &top, &bottom, &msb);

 	/* Writes always succeed with a valid number even if it gets interrupted. */
 	do {
 		cnt = local_inc_return(&t->cnt);
 		tb_time_val_set(&t->top, top, cnt);
 		tb_time_val_set(&t->bottom, bottom, cnt);
+		tb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
 	} while (cnt != local_read(&t->cnt));
 }

@@ -663,43 +685,7 @@ tb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
 	unsigned long ret;

 	ret = local_cmpxchg(l, expect, set);
-	return ret == expect;
-}
-
-static int tb_time_cmpxchg(tb_time_t *t, u64 expect, u64 set)
-{
-	unsigned long cnt, top, bottom;
-	unsigned long cnt2, top2, bottom2;
-	u64 val;
-
-	/* The cmpxchg always fails if it interrupted an update */
-	if (!__tb_time_read(t, &val, &cnt2))
-		return false;
-
-	if (val != expect)
-		return false;
-
-	cnt = local_read(&t->cnt);
-	if ((cnt & 3) != cnt2)
-		return false;
-
-	cnt2 = cnt + 1;
-
-	tb_time_split(val, &top, &bottom);
-	top = tb_time_val_cnt(top, cnt);
-	bottom = tb_time_val_cnt(bottom, cnt);
-
-	tb_time_split(set, &top2, &bottom2);
-	top2 = tb_time_val_cnt(top2, cnt2);
-	bottom2 = tb_time_val_cnt(bottom2, cnt2);
-
-	if (!tb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
-		return false;
-	if (!tb_time_read_cmpxchg(&t->top, top, top2))
-		return false;
-	if (!tb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
-		return false;
-	return true;
+	return ret == expect;
 }

 #else /* 64 bits */
@@ -714,12 +700,6 @@ static void tb_time_set(tb_time_t *t, u64 val)
 {
 	local_set(&t->time, val);
 }

-static bool tb_time_cmpxchg(tb_time_t *t, u64 expect, u64 set)
-{
-	u64 val;
-	val = local_cmpxchg(&t->time, expect, set);
-	return val == expect;
-}
 #endif

 /*
@@ -763,6 +743,25 @@ static inline void verify_event(struct tb_per_cpu *cpu_ring,
 }
 #endif

+/*
+ * The absolute time stamp drops the 5 MSBs and some clocks may
+ * require them. The tb_fix_abs_ts() will take a previous full
+ * time stamp, and add the 5 MSB of that time stamp on to the
+ * saved absolute time stamp. Then they are compared in case of
+ * the unlikely event that the latest time stamp incremented
+ * the 5 MSB.
+ */
+static inline u64 tb_fix_abs_ts(u64 abs, u64 save_ts)
+{
+	if (save_ts & TS_MSB) {
+		abs |= save_ts & TS_MSB;
+		/* Check for overflow */
+		if (unlikely(abs < save_ts))
+			abs += 1ULL << 59;
+	}
+	return abs;
+}
+
 /**
  * tb_event_timestamp - return the event's current time stamp
  * @buffer: The buffer that the event is on
@@ -788,8 +787,10 @@ u64 tb_event_timestamp(struct tb_ring *buffer,
 	u64 ts;

 	/* If the event includes an absolute time, then just use that */
-	if (event->type_len == TB_TYPETIME_STAMP)
-		return tb_event_time_stamp(event);
+	if (event->type_len == TB_TYPETIME_STAMP) {
+		ts = tb_event_time_stamp(event);
+		return tb_fix_abs_ts(ts, cpu_ring->tail_page->page->time_stamp);
+	}

 	nest = local_read(&cpu_ring->committing);
 	verify_event(cpu_ring, event);
@@ -825,7 +826,7 @@ size_t tb_nr_pages(struct tb_ring *buffer, int cpu)
 }

 /**
- * tb_nr_pages_dirty - get the number of used pages in the ring buffer
+ * tb_nr_dirty_pages - get the number of used pages in the ring buffer
  * @buffer: The ring_buffer to get the number of pages from
  * @cpu: The cpu of the ring_buffer to get the number of pages from
  *
@@ -834,10 +835,18 @@ size_t tb_nr_dirty_pages(struct tb_ring *buffer, int cpu)
 {
 	size_t read;
+	size_t lost;
 	size_t cnt;

 	read = local_read(&buffer->buffers[cpu]->pages_read);
+	lost = local_read(&buffer->buffers[cpu]->pages_lost);
 	cnt = local_read(&buffer->buffers[cpu]->pages_touched);
+
+	if (WARN_ON_ONCE(cnt < lost))
+		return 0;
+
+	cnt -= lost;
+
 	/* The reader can read an empty page, but not more than that */
 	if (cnt < read) {
 		WARN_ON_ONCE(read > cnt + 1);
@@ -847,21 +856,84 @@
 	return cnt - read;
 }

+static __always_inline bool full_hit(struct tb_ring *buffer, int cpu, int full)
+{
+	struct tb_per_cpu *cpu_ring = buffer->buffers[cpu];
+	size_t nr_pages;
+	size_t dirty;
+
+	nr_pages = cpu_ring->nr_pages;
+	if (!nr_pages || !full)
+		return true;
+
+	dirty = tb_nr_dirty_pages(buffer, cpu);
+
+	return (dirty * 100) > (full * nr_pages);
+}
+
 /*
  * tb_wake_up_waiters - wake up tasks waiting for ring buffer input
  *
  * Schedules a delayed work to wake up any task that is blocked on the
  * ring buffer waiters queue.
  */
-void tb_wake_up_waiters(struct tb_irq_work *work)
+static void tb_wake_up_waiters(struct tb_irq_work *work)
 {
 	wake_up_all(&work->waiters);
-	if (work->wakeup_full) {
+	if (work->full_waiters_pending || work->wakeup_full) {
 		work->wakeup_full = false;
+		work->full_waiters_pending = false;
 		wake_up_all(&work->full_waiters);
 	}
 }

+/**
+ * tb_wake_waiters_cpu - wake up any waiters on this ring buffer
+ * @buffer: The ring buffer to wake waiters on
+ * @cpu: The CPU buffer to wake waiters on
+ *
+ * When a file that represents a ring buffer is closing,
+ * it is prudent to wake up any waiters that are on it.
+ */
+static void tb_wake_waiters_cpu(struct tb_ring *buffer, int cpu)
+{
+	struct tb_per_cpu *cpu_ring;
+	struct tb_irq_work *rbwork;
+
+	if (!buffer)
+		return;
+
+	if (cpu == TB_RING_ALL_CPUS) {
+		/* Wake up individual ones too. One level recursion */
+		for_each_buffer_cpu(buffer, cpu)
+			tb_wake_waiters_cpu(buffer, cpu);
+
+		rbwork = &buffer->irq_work;
+	} else {
+		if (WARN_ON_ONCE(!buffer->buffers))
+			return;
+		if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+			return;
+
+		cpu_ring = buffer->buffers[cpu];
+		/* The CPU buffer may not have been initialized yet */
+		if (!cpu_ring)
+			return;
+		rbwork = &cpu_ring->irq_work;
+	}
+
+	rbwork->wait_index++;
+	/* make sure the waiters see the new index */
+	smp_wmb();
+
+	tb_wake_up_waiters(rbwork);
+}
+
+void tb_wake_up(struct tb_ring *buffer)
+{
+	tb_wake_waiters_cpu(buffer, TB_RING_ALL_CPUS);
+}
+
 /**
  * tb_wait - wait for input to the ring buffer
  * @buffer: buffer to wait on
@@ -877,6 +949,7 @@ int tb_wait(struct tb_ring *buffer, int cpu, int full)
 	struct tb_per_cpu *cpu_ring = NULL;
 	DEFINE_WAIT(wait);
 	struct tb_irq_work *work;
+	long wait_index;
 	int ret = 0;

 	/*
@@ -895,6 +968,8 @@
 		work = &cpu_ring->irq_work;
 	}

+	wait_index = READ_ONCE(work->wait_index);
+
 	while (true) {
 		if (full)
 			prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
@@ -934,30 +1009,32 @@
 		if (cpu == TB_RING_ALL_CPUS && !tb_empty(buffer))
 			break;

-		if (cpu != TB_RING_ALL_CPUS &&
-		    !tb_empty_cpu(buffer, cpu)) {
+		if (cpu != TB_RING_ALL_CPUS && !tb_empty_cpu(buffer, cpu)) {
 			unsigned long flags;
 			bool pagebusy;
-			size_t nr_pages;
-			size_t dirty;
+			bool done;

 			if (!full)
 				break;

 			raw_spin_lock_irqsave(&cpu_ring->reader_lock, flags);
 			pagebusy = cpu_ring->reader_page == cpu_ring->commit_page;
-			nr_pages = cpu_ring->nr_pages;
-			dirty = tb_nr_dirty_pages(buffer, cpu);
+			done = !pagebusy && full_hit(buffer, cpu, full);
+
 			if (!cpu_ring->shortest_full ||
-			    cpu_ring->shortest_full < full)
+			    cpu_ring->shortest_full > full)
 				cpu_ring->shortest_full = full;
 			raw_spin_unlock_irqrestore(&cpu_ring->reader_lock, flags);
-			if (!pagebusy &&
-			    (!nr_pages || (dirty * 100) > full * nr_pages))
+			if (done)
 				break;
 		}

 		schedule();
+
+		/* Make sure to see the new wait index */
+		smp_rmb();
+		if (wait_index != work->wait_index)
+			break;
 	}

 	if (full)
@@ -1270,8 +1347,8 @@ tb_set_head_page(struct tb_per_cpu *cpu_ring)
 	return NULL;
 }

-static int tb_head_page_replace(struct buffer_page *old,
-				struct buffer_page *new)
+static bool tb_head_page_replace(struct buffer_page *old,
+				 struct buffer_page *new)
 {
 	unsigned long *ptr = (unsigned long *)&old->list.prev->next;
 	unsigned long val;
@@ -1349,28 +1426,12 @@ static void tb_tail_page_update(struct tb_per_cpu *cpu_ring,
 	}
 }

-static int tb_check_bpage(struct tb_per_cpu *cpu_ring,
-			  struct buffer_page *bpage)
+static void tb_check_bpage(struct tb_per_cpu *cpu_ring,
+			   struct buffer_page *bpage)
 {
 	unsigned long val = (unsigned long)bpage;

-	if (RB_WARN_ON(cpu_ring, val & RB_FLAG_MASK))
-		return 1;
-
-	return 0;
-}
-
-/**
- * tb_check_list - make sure a pointer to a list has the last bits zero
- */
-static int tb_check_list(struct tb_per_cpu *cpu_ring,
-			 struct list_head *list)
-{
-	if (RB_WARN_ON(cpu_ring, tb_list_head(list->prev) != list->prev))
-		return 1;
-	if (RB_WARN_ON(cpu_ring, tb_list_head(list->next) != list->next))
-		return 1;
-	return 0;
+	RB_WARN_ON(cpu_ring, val & RB_FLAG_MASK);
 }

 /**
@@ -1380,39 +1441,23 @@ static int tb_check_list(struct tb_per_cpu *cpu_ring,
  * As a safety measure we check to make sure the data pages have not
  * been corrupted.
  */
-static int tb_check_pages(struct tb_per_cpu *cpu_ring)
+static void tb_check_pages(struct tb_per_cpu *cpu_ring)
 {
-	struct list_head *head = cpu_ring->pages;
-	struct buffer_page *bpage, *tmp;
-
-	/* Reset the head page if it exists */
-	if (cpu_ring->head_page)
-		tb_set_head_page(cpu_ring);
+	struct list_head *head = tb_list_head(cpu_ring->pages);
+	struct list_head *tmp;

-	tb_head_page_deactivate(cpu_ring);
-
-	if (RB_WARN_ON(cpu_ring, head->next->prev != head))
-		return -1;
-	if (RB_WARN_ON(cpu_ring, head->prev->next != head))
-		return -1;
+	if (RB_WARN_ON(cpu_ring, tb_list_head(tb_list_head(head->next)->prev) != head))
+		return;

-	if (tb_check_list(cpu_ring, head))
-		return -1;
+	if (RB_WARN_ON(cpu_ring, tb_list_head(tb_list_head(head->prev)->next) != head))
+		return;

-	list_for_each_entry_safe(bpage, tmp, head, list) {
-		if (RB_WARN_ON(cpu_ring,
-			       bpage->list.next->prev != &bpage->list))
-			return -1;
-		if (RB_WARN_ON(cpu_ring,
-			       bpage->list.prev->next != &bpage->list))
-			return -1;
-		if (tb_check_list(cpu_ring, &bpage->list))
-			return -1;
+	for (tmp = tb_list_head(head->next); tmp != head; tmp = tb_list_head(tmp->next)) {
+		if (RB_WARN_ON(cpu_ring, tb_list_head(tb_list_head(tmp->next)->prev) != tmp))
+			return;
+		if (RB_WARN_ON(cpu_ring, tb_list_head(tb_list_head(tmp->prev)->next) != tmp))
+			return;
 	}
-
-	tb_head_page_activate(cpu_ring);
-
-	return 0;
 }

 static int __tb_allocate_pages(struct tb_per_cpu *cpu_ring,
@@ -1579,9 +1624,9 @@ static void tb_free_cpu_ring(struct tb_per_cpu *cpu_ring)

 	free_buffer_page(cpu_ring->reader_page);

-	tb_head_page_deactivate(cpu_ring);
-
 	if (head) {
+		tb_head_page_deactivate(cpu_ring);
+
 		list_for_each_entry_safe(bpage, tmp, head, list) {
 			list_del_init(&bpage->list);
 			free_buffer_page(bpage);
@@ -1590,6 +1635,7 @@ static void tb_free_cpu_ring(struct tb_per_cpu *cpu_ring)
 		free_buffer_page(bpage);
 	}

+	free_page((unsigned long)cpu_ring->free_page);
 	kfree(cpu_ring);
 }

@@ -1729,7 +1775,7 @@ static inline unsigned long tb_page_write(struct buffer_page *bpage)
 	return local_read(&bpage->write) & RB_WRITE_MASK;
 }

-static int
+static bool
 tb_remove_pages(struct tb_per_cpu *cpu_ring, unsigned long nr_pages)
 {
 	struct list_head *tail_page, *to_remove, *next_page;
@@ -1771,6 +1817,9 @@
 		head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
 	}

+	/* Read iterators need to reset themselves when some pages are removed */
+	cpu_ring->pages_removed += nr_removed;
+
 	next_page = tb_list_head(to_remove)->next;

 	/*
@@ -1791,12 +1840,6 @@
 	cpu_ring->head_page = list_entry(next_page,
 					 struct buffer_page, list);

-	/*
-	 * change read pointer to make sure any read iterators reset
-	 * themselves
-	 */
-	cpu_ring->read = 0;
-
 	/* pages are removed, resume tracing and then free the pages */
 	atomic_dec(&cpu_ring->record_disabled);
 	raw_spin_unlock_irq(&cpu_ring->reader_lock);
@@ -1824,7 +1867,8 @@
 		 * Increment overrun to account for the lost events.
 		 */
 		local_add(page_entries, &cpu_ring->overrun);
-		local_sub(BUF_PAGE_SIZE, &cpu_ring->entries_bytes);
+		local_sub(tb_page_commit(to_remove_page), &cpu_ring->entries_bytes);
+		local_inc(&cpu_ring->pages_lost);
 	}

 	/*
@@ -1841,13 +1885,17 @@
 	return nr_removed == 0;
 }

-static int
+static bool
 tb_insert_pages(struct tb_per_cpu *cpu_ring)
 {
 	struct list_head *pages = &cpu_ring->new_pages;
-	int retries, success;
+	unsigned long flags;
+	bool success;
+	int retries;
+
+	/* Can be called at early boot up, where interrupts must not yet be enabled */
+	raw_spin_lock_irqsave(&cpu_ring->reader_lock, flags);

-	raw_spin_lock_irq(&cpu_ring->reader_lock);
 	/*
 	 * We are holding the reader lock, so the reader page won't be swapped
 	 * in the ring buffer. Now we are racing with the writer trying to
@@ -1863,15 +1911,16 @@
 	 * spinning.
 	 */
 	retries = 10;
-	success = 0;
+	success = false;
 	while (retries--) {
 		struct list_head *head_page, *prev_page, *r;
 		struct list_head *last_page, *first_page;
 		struct list_head *head_page_with_bit;
+		struct buffer_page *hpage = tb_set_head_page(cpu_ring);

-		head_page = &tb_set_head_page(cpu_ring)->list;
-		if (!head_page)
+		if (!hpage)
 			break;
+		head_page = &hpage->list;
 		prev_page = head_page->prev;

 		first_page = pages->next;
@@ -1892,7 +1941,7 @@
 			 * pointer to point to end of list
 			 */
 			head_page->prev = last_page;
-			success = 1;
+			success = true;
 			break;
 		}
 	}
@@ -1904,7 +1953,7 @@
 	 * tracing
 	 */
 	RB_WARN_ON(cpu_ring, !success);
-	raw_spin_unlock_irq(&cpu_ring->reader_lock);
+	raw_spin_unlock_irqrestore(&cpu_ring->reader_lock, flags);

 	/* free pages if they weren't inserted */
 	if (!success) {
@@ -1920,7 +1969,7 @@

 static void tb_update_pages(struct tb_per_cpu *cpu_ring)
 {
-	int success;
+	bool success;

 	if (cpu_ring->nr_pages_to_update > 0)
 		success = tb_insert_pages(cpu_ring);
@@ -1962,11 +2011,6 @@ tb_reader_event(struct tb_per_cpu *cpu_ring)
 			       cpu_ring->reader_page->read);
 }

-static __always_inline unsigned tb_page_commit(struct buffer_page *bpage)
-{
-	return local_read(&bpage->page->commit);
-}
-
 /* Size is determined by what has been committed */
 static __always_inline unsigned tb_page_size(struct buffer_page *bpage)
 {
@@ -1987,14 +2031,15 @@ tb_event_index(struct tb_event *event)
 	return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
 }

-static __inline void tb_overrun_page(struct tb_per_cpu *cpu_ring,
-				     struct buffer_page *bpage)
+static __inline void
+tb_overrun_page(struct tb_per_cpu *cpu_ring, struct buffer_page *bpage)
 {
 	long entries, size;

 	entries = tb_page_entries(bpage);
 	local_add(entries, &cpu_ring->overrun);
-	local_sub(BUF_PAGE_SIZE, &cpu_ring->entries_bytes);
+	local_sub(tb_page_commit(bpage), &cpu_ring->entries_bytes);
+	local_inc(&cpu_ring->pages_lost);

 	size = BUF_PAGE_SIZE - entries * sizeof(u32);
 	if (size > 0)
@@ -2185,9 +2230,6 @@ tb_reset_tail(struct tb_per_cpu *cpu_ring,

 	event = __tb_page_index(tail_page, tail);

-	/* account for padding bytes */
-	local_add(BUF_PAGE_SIZE - tail, &cpu_ring->entries_bytes);
-
 	/*
 	 * Save the original length to the meta data.
 	 * This will be used by the reader to add lost event
@@ -2201,7 +2243,8 @@
 	 * write counter enough to allow another writer to slip
 	 * in on this page.
 	 * We put in a discarded commit instead, to make sure
-	 * that this space is not used again.
+	 * that this space is not used again, and this space will
+	 * not be accounted in 'entries_bytes'.
 	 *
 	 * If we are less than the minimum size, we don't need to
 	 * worry about it.
 	 */
@@ -2212,6 +2255,9 @@
 		/* Mark the rest of the page with padding */
 		tb_event_set_padding(event);

+		/* Make sure the padding is visible before the write update */
+		smp_wmb();
+
 		/* Set the write back to the previous setting */
 		local_sub(length, &tail_page->write);
 		return;
@@ -2223,6 +2269,12 @@
 	/* time delta must be non zero */
 	event->time_delta = 1;

+	/* account for padding bytes */
+	local_add(BUF_PAGE_SIZE - tail, &cpu_ring->entries_bytes);
+
+	/* Make sure the padding is visible before the tail_page->write update */
+	smp_wmb();
+
 	/* Set write to end of buffer */
 	length = (tail + length) - BUF_PAGE_SIZE;
 	local_sub(length, &tail_page->write);
@@ -2383,8 +2435,17 @@ static void tb_add_timestamp(struct tb_per_cpu *cpu_ring,
 		  (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);

 	if (unlikely(info->delta > (1ULL << 59))) {
+
+		/*
+		 * Some timers can use more than 59 bits, and when a timestamp
+		 * is added to the buffer, it will lose those bits.
+		 */
+		if (abs && (info->ts & TS_MSB)) {
+
+			info->delta &= ABS_TS_MASK;
+
 		/* did the clock go backwards */
-		if (info->before == info->after && info->before > info->ts) {
+		} else if (info->before == info->after && info->before > info->ts) {
 			/* not interrupted */
 			static int once;
@@ -2478,26 +2539,7 @@ static unsigned tb_calculate_event_length(unsigned length)
 	return length;
 }

-static u64 tb_time_delta(struct tb_event *event)
-{
-	switch (event->type_len) {
-	case TB_TYPEPADDING:
-		return 0;
-
-	case TB_TYPETIME_EXTEND:
-		return tb_event_time_stamp(event);
-
-	case TB_TYPETIME_STAMP:
-		return 0;
-
-	case TB_TYPEDATA:
-		return event->time_delta;
-	default:
-		return 0;
-	}
-}
-
-static inline int
+static inline bool
 tb_try_to_discard(struct tb_per_cpu *cpu_ring,
 		  struct tb_event *event)
 {
@@ -2505,8 +2547,6 @@ tb_try_to_discard(struct tb_per_cpu *cpu_ring,
 	struct buffer_page *bpage;
 	unsigned long index;
 	unsigned long addr;
-	u64 write_stamp;
-	u64 delta;

 	new_index = tb_event_index(event);
 	old_index = new_index + tb_event_ts_length(event);
@@ -2515,41 +2555,34 @@
 	bpage = READ_ONCE(cpu_ring->tail_page);

-	delta = tb_time_delta(event);
-
-	if (!tb_time_read(&cpu_ring->write_stamp, &write_stamp))
-		return 0;
-
-	/* Make sure the write stamp is read before testing the location */
-	barrier();
-
+	/*
+	 * Make sure the tail_page is still the same and
+	 * the next write location is the end of this event
+	 */
 	if (bpage->page == (void *)addr && tb_page_write(bpage) == old_index) {
 		unsigned long write_mask =
 			local_read(&bpage->write) & ~RB_WRITE_MASK;
 		unsigned long event_length = tb_event_length(event);

-		/* Something came in, can't discard */
-		if (!tb_time_cmpxchg(&cpu_ring->write_stamp,
-				     write_stamp, write_stamp - delta))
-			return 0;
-
 		/*
-		 * It's possible that the event time delta is zero
-		 * (has the same time stamp as the previous event)
-		 * in which case write_stamp and before_stamp could
-		 * be the same. In such a case, force before_stamp
-		 * to be different than write_stamp. It doesn't
-		 * matter what it is, as long as its different.
+		 * For the before_stamp to be different than the write_stamp
+		 * to make sure that the next event adds an absolute
+		 * value and does not rely on the saved write stamp, which
+		 * is now going to be bogus.
+		 *
+		 * By setting the before_stamp to zero, the next event
+		 * is not going to use the write_stamp and will instead
+		 * create an absolute timestamp. This means there's no
+		 * reason to update the write_stamp!
 		 */
-		if (!delta)
-			tb_time_set(&cpu_ring->before_stamp, 0);
+		tb_time_set(&cpu_ring->before_stamp, 0);

 		/*
 		 * If an event were to come in now, it would see that the
 		 * write_stamp and the before_stamp are different, and assume
 		 * that this event just added itself before updating
 		 * the write stamp. The interrupting event will fix the
-		 * write stamp for us, and use the before stamp as its delta.
+		 * write stamp for us, and use an absolute timestamp.
 		 */

 		/*
@@ -2564,12 +2597,12 @@
 		if (index == old_index) {
 			/* update counters */
 			local_sub(event_length, &cpu_ring->entries_bytes);
-			return 1;
+			return true;
 		}
 	}

 	/* could not discard */
-	return 0;
+	return false;
 }

 static void tb_start_commit(struct tb_per_cpu *cpu_ring)
@@ -2600,6 +2633,10 @@ tb_set_commit_to_write(struct tb_per_cpu *cpu_ring)
 		if (RB_WARN_ON(cpu_ring,
 			       tb_is_reader_page(cpu_ring->tail_page)))
 			return;

+		/*
+		 * No need for a memory barrier here, as the update
+		 * of the tail_page did it for this page.
+		 */
 		local_set(&cpu_ring->commit_page->page->commit,
 			  tb_page_write(cpu_ring->commit_page));
 		tb_inc_page(&cpu_ring->commit_page);
@@ -2608,7 +2645,8 @@
 	while (tb_commit_index(cpu_ring) !=
 	       tb_page_write(cpu_ring->commit_page)) {
-
+		/* Make sure the readers see the content of what is committed. */
+		smp_wmb();
 		local_set(&cpu_ring->commit_page->page->commit,
 			  tb_page_write(cpu_ring->commit_page));
 		RB_WARN_ON(cpu_ring,
@@ -2674,8 +2712,7 @@ static inline void tb_event_discard(struct tb_event *event)
 	event->time_delta = 1;
 }

-static void tb_commit(struct tb_per_cpu *cpu_ring,
-		      struct tb_event *event)
+static void tb_commit(struct tb_per_cpu *cpu_ring)
 {
 	local_inc(&cpu_ring->entries);
 	tb_end_commit(cpu_ring);
@@ -2684,10 +2721,6 @@
 static __always_inline void
 tb_wakeups(struct tb_ring *buffer, struct tb_per_cpu *cpu_ring)
 {
-	size_t nr_pages;
-	size_t dirty;
-	size_t full;
-
 	if (buffer->irq_work.waiters_pending) {
 		buffer->irq_work.waiters_pending = false;
 		/* irq_work_queue() supplies it's own memory barriers */
@@ -2711,10 +2744,7 @@

 	cpu_ring->last_pages_touch = local_read(&cpu_ring->pages_touched);

-	full = cpu_ring->shortest_full;
-	nr_pages = cpu_ring->nr_pages;
-	dirty = tb_nr_dirty_pages(buffer, cpu_ring->cpu);
-	if (full && nr_pages && (dirty * 100) <= full * nr_pages)
+	if (!full_hit(buffer, cpu_ring->cpu, cpu_ring->shortest_full))
 		return;

 	cpu_ring->irq_work.wakeup_full = true;
@@ -2792,12 +2822,12 @@
  * Note: The TRANSITION bit only handles a single transition between context.
  */
-static __always_inline int
+static __always_inline bool
 tb_recursive_lock(struct tb_per_cpu *cpu_ring)
 {
 	unsigned int val = cpu_ring->current_context;
 	unsigned long pc = preempt_count();
-	int bit;
+	int bit; /* TODO: interrupt_context_level */

 	if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
 		bit = RB_CTX_NORMAL;
@@ -2814,14 +2844,14 @@
 		bit = RB_CTX_TRANSITION;
 		if (val & (1 << (bit + cpu_ring->nest))) {
 			do_tb_record_recursion();
-			return 1;
+			return true;
 		}
 	}

 	val |= (1 << (bit + cpu_ring->nest));
 	cpu_ring->current_context = val;

-	return 0;
+	return false;
 }

 static __always_inline void
@@ -2883,21 +2913,19 @@ void tb_nest_end(struct tb_ring *buffer)
 /**
  * tb_unlock_commit - commit a reserved
  * @buffer: The buffer to commit to
- * @event: The event pointer to commit.
  *
  * This commits the data to the ring buffer, and releases any locks held.
  *
  * Must be paired with tb_lock_reserve.
  */
-int tb_unlock_commit(struct tb_ring *buffer,
-		     struct tb_event *event)
+int tb_unlock_commit(struct tb_ring *buffer)
 {
 	struct tb_per_cpu *cpu_ring;
 	int cpu = raw_smp_processor_id();

 	cpu_ring = buffer->buffers[cpu];

-	tb_commit(cpu_ring, event);
+	tb_commit(cpu_ring);

 	tb_wakeups(buffer, cpu_ring);

@@ -2937,7 +2965,7 @@ static void dump_buffer_page(struct buffer_data_page *bpage,

 		case TB_TYPETIME_STAMP:
 			delta = tb_event_time_stamp(event);
-			ts = delta;
+			ts = tb_fix_abs_ts(delta, ts);
 			pr_info("  [%lld] absolute:%lld TIME STAMP\n", ts, delta);
 			break;

@@ -2993,7 +3021,7 @@ static void check_buffer(struct tb_per_cpu *cpu_ring,
 		return;

 	/*
-	 * If this interrupted another event, 
+	 * If this interrupted another event,
 	 */
 	if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
 		goto out;
@@ -3013,7 +3041,7 @@

 		case TB_TYPETIME_STAMP:
 			delta = tb_event_time_stamp(event);
-			ts = delta;
+			ts = tb_fix_abs_ts(delta, ts);
 			break;

 		case TB_TYPEPADDING:
@@ -3087,7 +3115,10 @@ __tb_reserve_next(struct tb_per_cpu *cpu_ring,
 	 * absolute timestamp.
 	 * Don't bother if this is the start of a new page (w == 0).
 	 */
-	if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
+	if (!w) {
+		/* Use the sub-buffer timestamp */
+		info->delta = 0;
+	} else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
 		info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
 		info->length += RB_LEN_TIME_EXTEND;
 	} else {
@@ -3110,26 +3141,20 @@
 	/* See if we shot pass the end of this buffer page */
 	if (unlikely(write > BUF_PAGE_SIZE)) {
-		/* before and after may now different, fix it up*/
-		b_ok = tb_time_read(&cpu_ring->before_stamp, &info->before);
-		a_ok = tb_time_read(&cpu_ring->write_stamp, &info->after);
-		if (a_ok && b_ok && info->before != info->after)
-			(void)tb_time_cmpxchg(&cpu_ring->before_stamp,
-					      info->before, info->after);
-		if (a_ok && b_ok)
-			check_buffer(cpu_ring, info, CHECK_FULL_PAGE);
+		check_buffer(cpu_ring, info, CHECK_FULL_PAGE);
 		return tb_move_tail(cpu_ring, tail, info);
 	}

 	if (likely(tail == w)) {
-		u64 save_before;
-		bool s_ok;
-
 		/* Nothing interrupted us between A and C */
 /*D*/		tb_time_set(&cpu_ring->write_stamp, info->ts);
-		barrier();
-/*E*/		s_ok = tb_time_read(&cpu_ring->before_stamp, &save_before);
-		RB_WARN_ON(cpu_ring, !s_ok);
+		/*
+		 * If something came in between C and D, the write stamp
+		 * may now not be in sync. But that's fine as the before_stamp
+		 * will be different and then the next event will just be forced
+		 * to use an absolute timestamp.
+		 */
 		if (likely(!(info->add_timestamp &
 			     (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
 			/* This did not interrupt any time update */
@@ -3137,41 +3162,40 @@ __tb_reserve_next(struct tb_per_cpu *cpu_ring,
 		else
 			/* Just use full timestamp for interrupting event */
 			info->delta = info->ts;
-		barrier();
 		check_buffer(cpu_ring, info, tail);
-		if (unlikely(info->ts != save_before)) {
-			/* SLOW PATH - Interrupted between C and E */
-
-			a_ok = tb_time_read(&cpu_ring->write_stamp, &info->after);
-			RB_WARN_ON(cpu_ring, !a_ok);
-
-			/* Write stamp must only go forward */
-			if (save_before > info->after) {
-				/*
-				 * We do not care about the result, only that
-				 * it gets updated atomically.
-				 */
-				(void)tb_time_cmpxchg(&cpu_ring->write_stamp,
-						      info->after, save_before);
-			}
-		}
 	} else {
 		u64 ts;
 		/* SLOW PATH - Interrupted between A and C */
-		a_ok = tb_time_read(&cpu_ring->write_stamp, &info->after);
+
+		/* Save the old before_stamp */
+		a_ok = tb_time_read(&cpu_ring->before_stamp, &info->before);
+		RB_WARN_ON(cpu_ring, !a_ok);
+
+		/*
+		 * Read a new timestamp and update the before_stamp to make
+		 * the next event after this one force using an absolute
+		 * timestamp. This is in case an interrupt were to come in
+		 * between E and F.
+		 */
+		ts = tb_time_stamp(cpu_ring->buffer);
+		tb_time_set(&cpu_ring->before_stamp, ts);
+
+		barrier();
+/*E*/		a_ok = tb_time_read(&cpu_ring->write_stamp, &info->after);
 		/* Was interrupted before here, write_stamp must be valid */
 		RB_WARN_ON(cpu_ring, !a_ok);
-		ts = tb_clock_local();
 		barrier();
-/*E*/		if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
-		    info->after < ts &&
-		    tb_time_cmpxchg(&cpu_ring->write_stamp,
-				    info->after, ts)) {
-			/* Nothing came after this event between C and E */
+/*F*/		if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+		    info->after == info->before && info->after < ts) {
+			/*
+			 * Nothing came after this event between C and F, it is
+			 * safe to use info->after for the delta as it
+			 * matched info->before and is still valid.
+			 */
 			info->delta = ts - info->after;
 		} else {
 			/*
-			 * Interrupted between C and E:
+			 * Interrupted between C and F:
 			 * Lost the previous events time stamp. Just set the
 			 * delta to zero, and this will be the same time as
 			 * the event this event interrupted. And the events that
@@ -3222,23 +3246,14 @@ tb_reserve_next_event(struct tb_ring *buffer,
 	int nr_loops = 0;
 	int add_ts_default;

-	tb_start_commit(cpu_ring);
-	/* The commit page can not change after this */
-
-#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
-	/*
-	 * Due to the ability to swap a cpu buffer from a buffer
-	 * it is possible it was swapped before we committed.
-	 * (committing stops a swap). We check for it here and
-	 * if it happened, we have to fail the write.
-	 */
-	barrier();
-	if (unlikely(READ_ONCE(cpu_ring->buffer) != buffer)) {
-		local_dec(&cpu_ring->committing);
-		local_dec(&cpu_ring->commits);
+	/* ring buffer does cmpxchg, make sure it is safe in NMI context */
+	if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+	    (unlikely(in_nmi()))) {
 		return NULL;
 	}
-#endif
+
+	tb_start_commit(cpu_ring);
+	/* The commit page can not change after this */

 	info.data = length;
 	info.length = tb_calculate_event_length(length);
@@ -3246,6 +3261,8 @@
 	if (tb_time_stamp_abs(cpu_ring->buffer)) {
 		add_ts_default = RB_ADD_STAMP_ABSOLUTE;
 		info.length += RB_LEN_TIME_EXTEND;
+		if (info.length > BUF_MAX_DATA_SIZE)
+			goto out_fail;
 	} else {
 		add_ts_default = RB_ADD_STAMP_NONE;
 	}
@@ -3498,7 +3515,7 @@ int tb_write(struct tb_ring *buffer,

 	memcpy(body, data, length);

-	tb_commit(cpu_ring, event);
+	tb_commit(cpu_ring);

 	tb_wakeups(buffer, cpu_ring);

@@ -3736,7 +3753,7 @@ u64 tb_oldest_event_ts(struct tb_ring *buffer, int cpu)
 }

 /**
- * tb_bytes_cpu - get the number of bytes consumed in a cpu buffer
+ * tb_bytes_cpu - get the number of bytes unconsumed in a cpu buffer
  * @buffer: The ring buffer
  * @cpu: The per CPU buffer to read from.
  */
@@ -3912,6 +3929,7 @@ tb_update_read_stamp(struct tb_per_cpu *cpu_ring,

 	case TB_TYPETIME_STAMP:
 		delta = tb_event_time_stamp(event);
+		delta = tb_fix_abs_ts(delta, cpu_ring->read_stamp);
 		cpu_ring->read_stamp = delta;
 		return;

@@ -3932,7 +3950,7 @@ tb_get_reader_page(struct tb_per_cpu *cpu_ring)
 	unsigned long overwrite;
 	unsigned long flags;
 	int nr_loops = 0;
-	int ret;
+	bool ret;

 	local_irq_save(flags);
 	arch_spin_lock(&cpu_ring->lock);
@@ -4057,6 +4075,37 @@
 	arch_spin_unlock(&cpu_ring->lock);
 	local_irq_restore(flags);

+	/*
+	 * The writer has preempt disabled; wait for it, but not forever.
+	 * Although, 1 second is pretty much "forever".
+	 */
+#define USECS_WAIT	1000000
+	for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
+		/* If the write is past the end of page, a writer is still updating it */
+		if (likely(!reader || tb_page_write(reader) <= BUF_PAGE_SIZE))
+			break;
+
+		udelay(1);
+
+		/* Get the latest version of the reader write value */
+		smp_rmb();
+	}
+
+	/* The writer is not moving forward? Something is wrong */
+	if (RB_WARN_ON(cpu_ring, nr_loops == USECS_WAIT))
+		reader = NULL;
+
+	/*
+	 * Make sure we see any padding after the write update
+	 * (see tb_reset_tail()).
+	 *
+	 * In addition, a writer may be writing on the reader page
+	 * if the page has not been fully filled, so the read barrier
+	 * is also needed to make sure we see the content of what is
+	 * committed by the writer (see tb_set_commit_to_write()).
+	 */
+	smp_rmb();
+
 	return reader;
 }

@@ -4081,6 +4130,7 @@ static void tb_advance_reader(struct tb_per_cpu *cpu_ring)

 	length = tb_event_length(event);
 	cpu_ring->reader_page->read += length;
+	cpu_ring->read_bytes += length;
 }

 int tb_lost_events(struct tb_per_cpu *cpu_ring)
@@ -4136,6 +4186,7 @@ tb_buffer_peek(struct tb_per_cpu *cpu_ring, u64 *ts,
 	case TB_TYPETIME_STAMP:
 		if (ts) {
 			*ts = tb_event_time_stamp(event);
+			*ts = tb_fix_abs_ts(*ts, reader->page->time_stamp);
 			tb_normalize_time_stamp(cpu_ring->buffer,
 						cpu_ring->cpu, ts);
 		}
@@ -4189,7 +4240,6 @@
 tb_reader_unlock(struct tb_per_cpu *cpu_ring, bool locked)
 {
 	if (likely(locked))
 		raw_spin_unlock(&cpu_ring->reader_lock);
-	return;
 }

 /**
@@ -4309,7 +4359,7 @@ bool tb_empty_cpu(struct tb_ring *buffer, int cpu)
 	struct tb_per_cpu *cpu_ring;
 	unsigned long flags;
 	bool dolock;
-	int ret;
+	bool ret;

 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return true;
@@ -4325,7 +4375,7 @@
 }

 /**
- * rind_buffer_empty - is the ring buffer empty?
+ * tb_empty - is the ring buffer empty?
  * @buffer: The ring buffer to test
  */
 bool tb_empty(struct tb_ring *ring)
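
[Editor's note: a sketch of how a producer pairs tb_lock_reserve() with the
new single-argument tb_unlock_commit(), matching the print_event.h change at
the top of this patch. Only the functions declared in trace_buffer.h and used
in print_event.h are real; emit_sample(), the payload struct, and the drop
condition are hypothetical.]

	/* Reserve space, fill it in, then either commit or discard. */
	static int emit_sample(struct tb_ring *ring, u32 value)
	{
		struct sample { u32 value; } *s;	/* hypothetical record */
		struct tb_event *event;

		event = tb_lock_reserve(ring, sizeof(*s));
		if (!event)
			return -EBUSY;	/* ring full, or in NMI without a safe cmpxchg */

		s = tb_event_data(event);
		s->value = value;

		if (!value)
			tb_discard_commit(ring, event);	/* discard still takes the event */
		else
			tb_unlock_commit(ring);		/* the commit no longer does */

		return 0;
	}

The event pointer could be dropped from tb_unlock_commit() because the commit
is tracked per CPU: tb_unlock_commit() looks up the current CPU's tb_per_cpu
and tb_commit() works only on that per-CPU state.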