diff --git a/deps/toptalk/Makefile b/deps/toptalk/Makefile index 53d99bc..d2f7da2 100644 --- a/deps/toptalk/Makefile +++ b/deps/toptalk/Makefile @@ -6,6 +6,12 @@ TEST_WINDOW = test-tcp-window TEST_VIDEO = test-video-detect TEST_RTSP = test-rtsp-tap +# Benchmark targets +BENCH_DECODE = bench-decode +BENCH_MALLOC = bench-malloc +BENCH_ROTATION = bench-rotation +BENCH_SORT = bench-sort + SRC = \ decode.c \ intervals.c \ @@ -25,7 +31,8 @@ HEADERS = \ tcp_rtt.h \ tcp_window.h \ video_detect.h \ - video_metrics.h + video_metrics.h \ + bench_common.h ifndef INTERVAL_COUNT INTERVAL_COUNT = 8 @@ -109,6 +116,58 @@ $(TEST_RTSP): $(LIB) test_rtsp_tap.c @echo Building $(TEST_RTSP) $(CC) -o $(TEST_RTSP) test_rtsp_tap.c $(LIB) $(LDLIBS) $(LDFLAGS) $(CFLAGS) +# Benchmark targets - built without sanitizers for accurate timing +# These need a clean library build without ASAN +BENCH_CFLAGS := -g -O2 -Wall -pedantic -std=c11 $(DEFINES) -fPIC -fno-omit-frame-pointer +BENCH_LDFLAGS := -lrt -lpthread $(PKGCONFIG_PCAP) $(PKGCONFIG_CURSES) + +# Library built without sanitizers for benchmarks; only the objects compiled from $(SRC) are archived +BENCH_LIB = toptalk-bench.a + +$(BENCH_LIB): $(SRC) $(HEADERS) Makefile + @echo Building $(BENCH_LIB) without sanitizers + $(CC) -c $(SRC) $(BENCH_CFLAGS) + gcc-ar cr $(BENCH_LIB) $(notdir $(SRC:.c=.o)) + @echo -e "$(BENCH_LIB) OK\n" + +$(BENCH_DECODE): $(BENCH_LIB) bench_decode.c bench_common.c bench_common.h + @echo Building $(BENCH_DECODE) + $(CC) -o $(BENCH_DECODE) bench_decode.c bench_common.c $(BENCH_LIB) $(LDLIBS) $(BENCH_LDFLAGS) $(BENCH_CFLAGS) + +$(BENCH_MALLOC): $(BENCH_LIB) bench_malloc.c bench_common.c bench_common.h + @echo Building $(BENCH_MALLOC) + $(CC) -o $(BENCH_MALLOC) bench_malloc.c bench_common.c $(BENCH_LIB) $(LDLIBS) $(BENCH_LDFLAGS) $(BENCH_CFLAGS) + +$(BENCH_ROTATION): $(BENCH_LIB) bench_rotation.c bench_common.c bench_common.h + @echo Building $(BENCH_ROTATION) + $(CC) -o $(BENCH_ROTATION) bench_rotation.c bench_common.c $(BENCH_LIB) $(LDLIBS) $(BENCH_LDFLAGS) $(BENCH_CFLAGS) + +$(BENCH_SORT): 
$(BENCH_LIB) bench_sort.c bench_common.c bench_common.h + @echo Building $(BENCH_SORT) + $(CC) -o $(BENCH_SORT) bench_sort.c bench_common.c $(BENCH_LIB) $(LDLIBS) $(BENCH_LDFLAGS) $(BENCH_CFLAGS) + +BENCH_REGRESSION = bench-regression + +$(BENCH_REGRESSION): $(BENCH_LIB) bench_regression.c bench_common.c bench_common.h timeywimey.c + @echo Building $(BENCH_REGRESSION) + $(CC) -o $(BENCH_REGRESSION) bench_regression.c bench_common.c timeywimey.c $(BENCH_LIB) $(LDLIBS) $(BENCH_LDFLAGS) $(BENCH_CFLAGS) + +.PHONY: bench +bench: $(BENCH_DECODE) $(BENCH_MALLOC) $(BENCH_ROTATION) $(BENCH_SORT) + @echo "Running decode benchmark..." + @./$(BENCH_DECODE) + @echo "Running malloc benchmark..." + @./$(BENCH_MALLOC) + @echo "Running rotation benchmark..." + @./$(BENCH_ROTATION) + @echo "Running sort benchmark..." + @./$(BENCH_SORT) + +.PHONY: bench-test +bench-test: $(BENCH_REGRESSION) + @echo "Running performance regression tests..." + @./$(BENCH_REGRESSION) + .PHONY: test test: $(TEST) $(TEST_RTT) $(TEST_WINDOW) $(TEST_VIDEO) $(TEST_RTSP) @echo "Running RTT unit tests (no root required)..." 
@@ -133,4 +192,5 @@ clang-analyze: clean .PHONY: clean clean: rm $(LIB) $(PROG) $(TEST) $(TEST_RTT) $(TEST_WINDOW) $(TEST_VIDEO) $(TEST_RTSP) *.o *.a || true + rm $(BENCH_DECODE) $(BENCH_MALLOC) $(BENCH_ROTATION) $(BENCH_SORT) $(BENCH_REGRESSION) || true rm *.gcno *.gcov *.gcda || true diff --git a/deps/toptalk/bench_common.c b/deps/toptalk/bench_common.c new file mode 100644 index 0000000..fcc6fcb --- /dev/null +++ b/deps/toptalk/bench_common.c @@ -0,0 +1,90 @@ +/* + * bench_common.c - Benchmark utilities implementation + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> /* NOTE(review): header name lost in extraction, inferred from usage - confirm */ + +#include "bench_common.h" + +/* Get CPU frequency from /proc/cpuinfo (Linux) */ +uint64_t bench_get_cpu_freq(void) +{ + FILE *f = fopen("/proc/cpuinfo", "r"); + if (!f) { + /* Fallback: assume 3 GHz */ + return 3000000000ULL; + } + + char line[256]; + double mhz = 0.0; + + while (fgets(line, sizeof(line), f)) { + if (strncmp(line, "cpu MHz", 7) == 0) { + char *colon = strchr(line, ':'); + if (colon) { + mhz = atof(colon + 1); + break; + } + } + } + fclose(f); + + if (mhz < 100.0) { + /* Fallback: assume 3 GHz */ + return 3000000000ULL; + } + + return (uint64_t)(mhz * 1e6); +} + +void bench_run(const char *name, + void (*fn)(void *arg), + void *arg, + uint64_t iterations, + struct bench_result *result) +{ + uint64_t freq = bench_get_cpu_freq(); + uint64_t start, end, total = 0; + + /* Warm up - run a few iterations to prime caches */ + for (uint64_t i = 0; i < 100 && i < iterations; i++) { + fn(arg); + } + + /* Timed run */ + start = bench_start(); + for (uint64_t i = 0; i < iterations; i++) { + fn(arg); + } + end = bench_end(); + total = bench_cycles(start, end); + + /* Fill in results */ + result->name = name; + result->iterations = iterations; + result->total_cycles = total; + result->cycles_per_op = (double)total / (double)iterations; + result->ns_per_op = bench_cycles_to_ns(total, freq) / (double)iterations; +} + +void bench_report_header(void) +{ + printf("\n%-40s %12s %12s 
%12s\n", + "Benchmark", "Iterations", "Cycles/op", "ns/op"); + printf("%-40s %12s %12s %12s\n", + "----------------------------------------", + "------------", "------------", "------------"); +} + +void bench_report(const struct bench_result *result) +{ + printf("%-40s %12llu %12.1f %12.1f\n", + result->name, + (unsigned long long)result->iterations, /* uint64_t is not unsigned long on every ABI */ + result->cycles_per_op, + result->ns_per_op); +} diff --git a/deps/toptalk/bench_common.h b/deps/toptalk/bench_common.h new file mode 100644 index 0000000..ec8d6c1 --- /dev/null +++ b/deps/toptalk/bench_common.h @@ -0,0 +1,122 @@ +/* + * bench_common.h - Benchmark utilities for performance measurement + * + * Provides: + * - Cycle-accurate timing using rdtsc (x86_64) + * - Wall-clock timing utilities + * - Benchmark runner and reporting + */ + +#ifndef BENCH_COMMON_H +#define BENCH_COMMON_H + +#include <stdint.h> +#include <stddef.h> /* NOTE(review): header name lost in extraction, this one is a guess - confirm */ + +/* Benchmark result structure */ +struct bench_result { + const char *name; + uint64_t iterations; + uint64_t total_cycles; + double cycles_per_op; + double ns_per_op; +}; + +/* + * Read CPU timestamp counter (x86_64). + * Returns current cycle count. Use bench_cycles() to compute elapsed. + */ +static inline uint64_t bench_start(void) +{ + uint32_t lo, hi; + /* Serialize to ensure timing is accurate */ + __asm__ volatile ( + "cpuid\n\t" + "rdtsc\n\t" + : "=a" (lo), "=d" (hi) + : "a" (0) + : "rbx", "rcx" + ); + return ((uint64_t)hi << 32) | lo; +} + +/* + * Read timestamp counter at end of measurement. + * Uses rdtscp for better serialization on modern CPUs. + */ +static inline uint64_t bench_end(void) +{ + uint32_t lo, hi; + __asm__ volatile ( + "rdtscp\n\t" + "mov %%eax, %0\n\t" + "mov %%edx, %1\n\t" + "cpuid\n\t" + : "=r" (lo), "=r" (hi) + : + : "rax", "rbx", "rcx", "rdx" + ); + return ((uint64_t)hi << 32) | lo; +} + +/* + * Compute elapsed cycles between start and end. + */ +static inline uint64_t bench_cycles(uint64_t start, uint64_t end) +{ + return end - start; +} + +/* + * Get approximate CPU frequency in Hz. 
+ * Uses /proc/cpuinfo on Linux; falls back to 3 GHz if no usable "cpu MHz" entry is found. + */ +uint64_t bench_get_cpu_freq(void); + +/* + * Convert cycles to nanoseconds given CPU frequency. freq_hz is the divisor; passing 0 yields a non-finite result. + */ +static inline double bench_cycles_to_ns(uint64_t cycles, uint64_t freq_hz) +{ + return (double)cycles * 1e9 / (double)freq_hz; +} + +/* + * Run a benchmark function multiple times and collect statistics. + * + * name: Benchmark name for reporting + * fn: Function to benchmark (called with arg) + * arg: Argument passed to fn + * iterations: Number of times to call fn + * result: Output benchmark results + */ +void bench_run(const char *name, + void (*fn)(void *arg), + void *arg, + uint64_t iterations, + struct bench_result *result); + +/* + * Print benchmark results in a formatted table. + */ +void bench_report(const struct bench_result *result); + +/* + * Print header for benchmark report table. + */ +void bench_report_header(void); + +/* + * Prevent compiler from optimizing away a value. + * Use to ensure benchmark results are "used". + */ +#define BENCH_DONT_OPTIMIZE(val) \ + __asm__ volatile ("" : : "r,m" (val) : "memory") + +/* + * Memory barrier to prevent reordering. + */ +#define BENCH_BARRIER() \ + __asm__ volatile ("" ::: "memory") + +#endif /* BENCH_COMMON_H */ diff --git a/deps/toptalk/bench_decode.c b/deps/toptalk/bench_decode.c new file mode 100644 index 0000000..0742006 --- /dev/null +++ b/deps/toptalk/bench_decode.c @@ -0,0 +1,199 @@ +/* + * bench_decode.c - Benchmark for header parsing overhead + * + * Measures the cost of: + * 1. Single decode pass (decode_ethernet only) + * 2. Decode + find_tcp_header (current behavior) + * 3. 
Decode with stored offset (proposed optimization) + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> +#include <pcap.h> /* NOTE(review): header names lost in extraction, inferred from usage (struct pcap_pkthdr) - confirm */ + +#include "bench_common.h" +#include "flow.h" +#include "decode.h" + +/* Synthetic TCP packet: Ethernet + IPv4 + TCP + payload */ +static uint8_t test_packet[128]; +static struct pcap_pkthdr test_pkthdr; + +/* Pre-computed L4 offset for the optimized path */ +static uint16_t precomputed_l4_offset; + +/* Build a synthetic TCP packet for benchmarking */ +static void build_test_packet(void) +{ + memset(test_packet, 0, sizeof(test_packet)); + + /* Ethernet header (14 bytes) */ + uint8_t *eth = test_packet; + eth[12] = 0x08; /* EtherType: IPv4 (0x0800) */ + eth[13] = 0x00; + + /* IPv4 header (20 bytes) at offset 14 */ + uint8_t *ip4 = test_packet + 14; + ip4[0] = 0x45; /* Version 4, IHL 5 (20 bytes) */ + ip4[1] = 0x00; /* DSCP/ECN */ + ip4[2] = 0x00; /* Total length: 66 (20 IP + 20 TCP + 26 payload) */ + ip4[3] = 0x42; + ip4[8] = 0x40; /* TTL */ + ip4[9] = 0x06; /* Protocol: TCP */ + /* Source IP: 10.0.0.1 */ + ip4[12] = 10; ip4[13] = 0; ip4[14] = 0; ip4[15] = 1; + /* Dest IP: 10.0.0.2 */ + ip4[16] = 10; ip4[17] = 0; ip4[18] = 0; ip4[19] = 2; + + /* TCP header (20 bytes) at offset 34 */ + uint8_t *tcp = test_packet + 34; + tcp[0] = 0x04; /* Source port: 1234 */ + tcp[1] = 0xD2; + tcp[2] = 0x00; /* Dest port: 80 */ + tcp[3] = 0x50; + /* Sequence number */ + tcp[4] = 0x00; tcp[5] = 0x00; tcp[6] = 0x10; tcp[7] = 0x00; + /* Ack number */ + tcp[8] = 0x00; tcp[9] = 0x00; tcp[10] = 0x20; tcp[11] = 0x00; + tcp[12] = 0x50; /* Data offset: 5 (20 bytes), flags: 0 */ + tcp[13] = 0x10; /* ACK flag */ + tcp[14] = 0xFF; /* Window: 65535 */ + tcp[15] = 0xFF; + + /* Payload starts at offset 54 */ + memset(test_packet + 54, 'A', 26); + + /* Pcap header */ + test_pkthdr.ts.tv_sec = 0; + test_pkthdr.ts.tv_usec = 0; + test_pkthdr.caplen = 80; + test_pkthdr.len = 80; + + /* Pre-compute L4 offset: Ethernet (14) + IPv4 (20) = 34 */ + precomputed_l4_offset = 34; +} + 
+/* Benchmark 1: decode_ethernet only (baseline) */ +static void bench_decode_only(void *arg) +{ + (void)arg; + struct flow_pkt pkt; + char errbuf[256]; + + int ret = decode_ethernet(&test_pkthdr, test_packet, &pkt, errbuf); + BENCH_DONT_OPTIMIZE(ret); + BENCH_DONT_OPTIMIZE(pkt.flow_rec.flow.proto); +} + +/* Forward declaration - find_tcp_header is static in intervals.c, + * so we simulate its work here */ +static const struct hdr_tcp *find_tcp_header_sim(const uint8_t *packet, + uint32_t caplen) +{ + /* Simulate the header traversal that find_tcp_header does */ + if (caplen < 14) + return NULL; + + /* Check ethertype */ + uint16_t ethertype = (packet[12] << 8) | packet[13]; + const uint8_t *ip_start; + + if (ethertype == 0x8100 && caplen >= 18) { + /* VLAN with the full 18-byte header captured - skip 4 more bytes; a truncated tag falls through and is rejected below */ + ethertype = (packet[16] << 8) | packet[17]; + ip_start = packet + 18; + caplen -= 18; + } else { + ip_start = packet + 14; + caplen -= 14; + } + + if (ethertype != 0x0800) /* Not IPv4 for simplicity */ + return NULL; + + if (caplen < 20) + return NULL; + + /* IPv4 header length */ + uint8_t ihl = (ip_start[0] & 0x0F) * 4; + if (caplen < ihl + 20) + return NULL; + + /* Check protocol is TCP */ + if (ip_start[9] != 6) + return NULL; + + return (const struct hdr_tcp *)(ip_start + ihl); +} + +/* Benchmark 2: decode_ethernet + find_tcp_header (current behavior) */ +static void bench_decode_plus_find(void *arg) +{ + (void)arg; + struct flow_pkt pkt; + char errbuf[256]; + + int ret = decode_ethernet(&test_pkthdr, test_packet, &pkt, errbuf); + BENCH_DONT_OPTIMIZE(ret); + + /* Simulate what handle_packet does - re-parse to find TCP header */ + const struct hdr_tcp *tcp = find_tcp_header_sim(test_packet, + test_pkthdr.caplen); + BENCH_DONT_OPTIMIZE(tcp); +} + +/* Benchmark 3: decode with stored offset (proposed optimization) */ +static void bench_decode_with_offset(void *arg) +{ + (void)arg; + struct flow_pkt pkt; + char errbuf[256]; + + int ret = decode_ethernet(&test_pkthdr, test_packet, &pkt, errbuf); + 
BENCH_DONT_OPTIMIZE(ret); + + /* Use pre-computed offset - O(1) pointer arithmetic, no second pass over the headers */ + const struct hdr_tcp *tcp = (const struct hdr_tcp *)(test_packet + precomputed_l4_offset); + BENCH_DONT_OPTIMIZE(tcp); +} + +int main(void) +{ + struct bench_result r1, r2, r3; + const uint64_t iterations = 100000; /* timed calls per benchmark, same for all three so results are comparable */ + + printf("\n=== Header Parsing Benchmark ===\n"); + printf("Packet: Ethernet + IPv4 + TCP (80 bytes)\n"); + + build_test_packet(); + + bench_report_header(); + + bench_run("decode_ethernet (baseline)", bench_decode_only, NULL, + iterations, &r1); + bench_report(&r1); + + bench_run("decode + find_tcp_header (current)", bench_decode_plus_find, NULL, + iterations, &r2); + bench_report(&r2); + + bench_run("decode + stored offset (proposed)", bench_decode_with_offset, NULL, + iterations, &r3); + bench_report(&r3); + + printf("\n--- Analysis ---\n"); + printf("find_tcp_header overhead: %.1f cycles (%.1f ns)\n", + r2.cycles_per_op - r1.cycles_per_op, + r2.ns_per_op - r1.ns_per_op); + printf("Stored offset overhead: %.1f cycles (%.1f ns)\n", + r3.cycles_per_op - r1.cycles_per_op, + r3.ns_per_op - r1.ns_per_op); + printf("Savings from optimization: %.1f cycles (%.1f%%)\n", + r2.cycles_per_op - r3.cycles_per_op, + 100.0 * (r2.cycles_per_op - r3.cycles_per_op) / r2.cycles_per_op); + + return 0; +} diff --git a/deps/toptalk/bench_malloc.c b/deps/toptalk/bench_malloc.c new file mode 100644 index 0000000..fa062a1 --- /dev/null +++ b/deps/toptalk/bench_malloc.c @@ -0,0 +1,123 @@ +/* + * bench_malloc.c - Benchmark for per-packet allocation overhead + * + * Measures the cost of: + * 1. malloc + free per packet (current behavior) + * 2. Ring buffer slot reuse (proposed optimization) + * 3. 
Pool allocator (alternative) + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "bench_common.h" +#include "flow.h" + +/* Size of flow_pkt_list structure (approximate) */ +#define ALLOC_SIZE 600 + +/* Ring buffer for proposed optimization */ +#define RING_SIZE 4096 +static uint8_t ring_buffer[RING_SIZE][ALLOC_SIZE]; +static uint32_t ring_head = 0; + +/* Simple pool allocator for comparison */ +#define POOL_SIZE 4096 +static uint8_t pool_buffer[POOL_SIZE][ALLOC_SIZE]; +static uint8_t *pool_free_list[POOL_SIZE]; +static int pool_free_count = POOL_SIZE; + +static void pool_init(void) +{ + for (int i = 0; i < POOL_SIZE; i++) { + pool_free_list[i] = pool_buffer[i]; + } + pool_free_count = POOL_SIZE; +} + +static void *pool_alloc(void) +{ + if (pool_free_count == 0) + return NULL; + return pool_free_list[--pool_free_count]; +} + +static void pool_free(void *ptr) +{ + if (pool_free_count < POOL_SIZE) + pool_free_list[pool_free_count++] = ptr; +} + +/* Benchmark 1: malloc + free (current behavior) */ +static void bench_malloc_free(void *arg) +{ + (void)arg; + void *p = malloc(ALLOC_SIZE); + BENCH_DONT_OPTIMIZE(p); + /* Touch memory to ensure it's allocated; skipped if malloc failed */ + if (p) memset(p, 0, 64); + free(p); +} + +/* Benchmark 2: Ring buffer slot reuse (proposed) */ +static void bench_ring_buffer(void *arg) +{ + (void)arg; + void *p = ring_buffer[ring_head & (RING_SIZE - 1)]; + ring_head++; + BENCH_DONT_OPTIMIZE(p); + /* Touch memory like we would in real usage */ + memset(p, 0, 64); + /* No free needed - slot is reused on wrap */ +} + +/* Benchmark 3: Pool allocator */ +static void bench_pool_alloc(void *arg) +{ + (void)arg; + void *p = pool_alloc(); + BENCH_DONT_OPTIMIZE(p); + if (p) { + memset(p, 0, 64); + pool_free(p); + } +} + +int main(void) +{ + struct bench_result r1, r2, r3; + const uint64_t iterations = 1000000; + + printf("\n=== Per-Packet Allocation Benchmark ===\n"); + printf("Allocation size: %d bytes (approx flow_pkt_list)\n", ALLOC_SIZE); + + pool_init(); + 
+ bench_report_header(); + + bench_run("malloc + free (current)", bench_malloc_free, NULL, + iterations, &r1); + bench_report(&r1); + + bench_run("ring buffer (proposed)", bench_ring_buffer, NULL, + iterations, &r2); + bench_report(&r2); + + bench_run("pool allocator (alternative)", bench_pool_alloc, NULL, + iterations, &r3); + bench_report(&r3); + + printf("\n--- Analysis ---\n"); + printf("malloc/free cost: %.1f cycles (%.1f ns)\n", + r1.cycles_per_op, r1.ns_per_op); + printf("Ring buffer cost: %.1f cycles (%.1f ns)\n", + r2.cycles_per_op, r2.ns_per_op); + printf("Pool allocator cost: %.1f cycles (%.1f ns)\n", + r3.cycles_per_op, r3.ns_per_op); + printf("Ring buffer savings: %.1f%% vs malloc\n", + 100.0 * (r1.cycles_per_op - r2.cycles_per_op) / r1.cycles_per_op); + + return 0; +} diff --git a/deps/toptalk/bench_regression.c b/deps/toptalk/bench_regression.c new file mode 100644 index 0000000..63bee9b --- /dev/null +++ b/deps/toptalk/bench_regression.c @@ -0,0 +1,366 @@ +/* + * bench_regression.c - Performance regression tests + * + * Verifies that key operations meet minimum performance thresholds. + * Run with: make bench-regression && ./bench-regression (or: make bench-test, which builds and runs it) + * + * Returns 0 if all tests pass, 1 if any test fails. 
+ */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> +#include <arpa/inet.h> /* NOTE(review): header names lost in extraction, inferred from usage (htonl) - confirm */ + +#include "bench_common.h" +#include "flow.h" +#include "decode.h" +#include "intervals.h" +#include "uthash.h" + +/* Test thresholds (cycles per operation) - set conservatively high */ +#define DECODE_THRESHOLD_CYCLES 500 /* Header parsing */ +#define MALLOC_THRESHOLD_CYCLES 200 /* Allocation overhead */ +#define RINGBUF_THRESHOLD_CYCLES 100 /* Ring buffer ops including memset */ +#define TOPN_THRESHOLD_CYCLES 5000 /* Top-N selection for 100 flows */ +#define UPDATE_STATS_THRESHOLD 1000 /* Per-packet stats update (target: 200) */ + +/* Test iteration counts - enough for stable measurements */ +#define DECODE_ITERATIONS 10000 +#define MALLOC_ITERATIONS 100000 +#define TOPN_ITERATIONS 1000 +#define UPDATE_STATS_ITERATIONS 100000 + +static int tests_failed = 0; + +/* Synthetic TCP packet for decode test */ +static uint8_t test_packet[128]; +static struct pcap_pkthdr test_pkthdr; + +static void build_test_packet(void) +{ + memset(test_packet, 0, sizeof(test_packet)); + + /* Ethernet header (14 bytes) */ + uint8_t *eth = test_packet; + eth[12] = 0x08; /* EtherType: IPv4 (0x0800) */ + eth[13] = 0x00; + + /* IPv4 header (20 bytes) at offset 14 */ + uint8_t *ip4 = test_packet + 14; + ip4[0] = 0x45; /* Version 4, IHL 5 (20 bytes) */ + ip4[1] = 0x00; /* DSCP/ECN */ + ip4[2] = 0x00; /* Total length: 66 (20 IP + 20 TCP + 26 payload) */ + ip4[3] = 0x42; + ip4[8] = 0x40; /* TTL */ + ip4[9] = 0x06; /* Protocol: TCP */ + /* Source IP: 10.0.0.1 */ + ip4[12] = 10; ip4[13] = 0; ip4[14] = 0; ip4[15] = 1; + /* Dest IP: 10.0.0.2 */ + ip4[16] = 10; ip4[17] = 0; ip4[18] = 0; ip4[19] = 2; + + /* TCP header (20 bytes) at offset 34 */ + uint8_t *tcp = test_packet + 34; + tcp[0] = 0x04; /* Source port: 1234 */ + tcp[1] = 0xD2; + tcp[2] = 0x00; /* Dest port: 80 */ + tcp[3] = 0x50; + /* Sequence number */ + tcp[4] = 0x00; tcp[5] = 0x00; tcp[6] = 0x10; tcp[7] = 0x00; + /* Ack number */ + tcp[8] = 0x00; tcp[9] = 
0x00; tcp[10] = 0x20; tcp[11] = 0x00; + tcp[12] = 0x50; /* Data offset: 5 (20 bytes), flags: 0 */ + tcp[13] = 0x10; /* ACK flag */ + tcp[14] = 0xFF; /* Window: 65535 */ + tcp[15] = 0xFF; + + /* Payload starts at offset 54 */ + memset(test_packet + 54, 'A', 26); + + /* Pcap header */ + test_pkthdr.ts.tv_sec = 0; + test_pkthdr.ts.tv_usec = 0; + test_pkthdr.caplen = 80; + test_pkthdr.len = 80; +} + +/* + * Test 1: Decode performance + */ +static void bench_decode_fn(void *arg) +{ + (void)arg; + struct flow_pkt pkt; + char errbuf[256]; + int ret = decode_ethernet(&test_pkthdr, test_packet, &pkt, errbuf); + BENCH_DONT_OPTIMIZE(ret); + BENCH_DONT_OPTIMIZE(pkt.flow_rec.flow.proto); +} + +static int test_decode_performance(void) +{ + struct bench_result r; + + build_test_packet(); + bench_run("decode_ethernet", bench_decode_fn, NULL, DECODE_ITERATIONS, &r); + + printf(" decode: %.1f cycles/op (threshold: %d)\n", + r.cycles_per_op, DECODE_THRESHOLD_CYCLES); + + if (r.cycles_per_op > DECODE_THRESHOLD_CYCLES) { + printf(" FAIL: decode too slow\n"); + return 1; + } + return 0; +} + +/* + * Test 2: Ring buffer performance (simulated) + */ +#define RING_SIZE 4096 /* must stay a power of two: slots are selected with ring_head & (RING_SIZE - 1) */ +static uint8_t ring_buffer[RING_SIZE][600]; +static uint32_t ring_head = 0; + +static void bench_ringbuf_fn(void *arg) +{ + (void)arg; + void *p = ring_buffer[ring_head & (RING_SIZE - 1)]; + ring_head++; + BENCH_DONT_OPTIMIZE(p); + memset(p, 0, 64); +} + +static int test_ringbuf_performance(void) +{ + struct bench_result r; + + ring_head = 0; + bench_run("ring_buffer", bench_ringbuf_fn, NULL, MALLOC_ITERATIONS, &r); + + printf(" ring buffer: %.1f cycles/op (threshold: %d)\n", + r.cycles_per_op, RINGBUF_THRESHOLD_CYCLES); + + if (r.cycles_per_op > RINGBUF_THRESHOLD_CYCLES) { + printf(" FAIL: ring buffer too slow\n"); + return 1; + } + return 0; +} + +/* + * Test 3: Top-N selection performance + */ +struct topn_flow_hash { + struct flow flow; + int64_t bytes; + UT_hash_handle hh; +}; + +static struct topn_flow_hash 
*topn_table = NULL; +#define TOPN_FLOWS 100 /* entries inserted by populate_topn_table() */ +#define TOPN_N 10 /* capacity of topn_result[] */ + +static struct topn_flow_hash *topn_result[TOPN_N]; + +static void populate_topn_table(void) +{ + for (int i = 0; i < TOPN_FLOWS; i++) { + struct topn_flow_hash *entry = calloc(1, sizeof(*entry)); + entry->flow.ethertype = 0x0800; + entry->flow.src_ip.s_addr = htonl(0x0a000001 + i); + entry->flow.dst_ip.s_addr = htonl(0x0a000100); + entry->flow.sport = 1024 + i; + entry->flow.dport = 80; + entry->bytes = rand() % 1000000; + HASH_ADD(hh, topn_table, flow, sizeof(struct flow), entry); + } +} + +static void free_topn_table(void) +{ + struct topn_flow_hash *entry, *tmp; + HASH_ITER(hh, topn_table, entry, tmp) { + HASH_DEL(topn_table, entry); + free(entry); + } + topn_table = NULL; +} + +static void find_top_n(void) +{ + struct topn_flow_hash *iter, *tmp; + int count = 0; + + HASH_ITER(hh, topn_table, iter, tmp) { + if (count < TOPN_N) { + topn_result[count++] = iter; + for (int i = count - 1; i > 0; i--) { + if (topn_result[i]->bytes > topn_result[i-1]->bytes) { + struct topn_flow_hash *t = topn_result[i]; + topn_result[i] = topn_result[i-1]; + topn_result[i-1] = t; + } + } + } else if (iter->bytes > topn_result[TOPN_N-1]->bytes) { + topn_result[TOPN_N-1] = iter; + for (int i = TOPN_N - 1; i > 0; i--) { + if (topn_result[i]->bytes > topn_result[i-1]->bytes) { + struct topn_flow_hash *t = topn_result[i]; + topn_result[i] = topn_result[i-1]; + topn_result[i-1] = t; + } + } + } + } +} + +static void bench_topn_fn(void *arg) +{ + (void)arg; + find_top_n(); + BENCH_DONT_OPTIMIZE(topn_result[0]); +} + +static int test_topn_performance(void) +{ + struct bench_result r; + + srand(42); /* fixed seed for the rand() calls in populate_topn_table() */ + populate_topn_table(); + + bench_run("top-N selection", bench_topn_fn, NULL, TOPN_ITERATIONS, &r); + + free_topn_table(); + + printf(" top-N: %.1f cycles/op (threshold: %d)\n", + r.cycles_per_op, TOPN_THRESHOLD_CYCLES); + + if (r.cycles_per_op > TOPN_THRESHOLD_CYCLES) { + printf(" FAIL: top-N selection too slow\n"); + return 1; + 
} + return 0; +} + +/* + * Test 4: Per-packet stats update (the real hot path) + * + * This measures the actual per-packet processing path: + * - Ring buffer insertion + * - Hash table lookup/insert + * - Interval delta accumulation + * - Expiration check + * + * Tests two scenarios: + * - Single flow (best case: hash hit) + * - Many flows (stress test: hash table growth) + */ +static struct flow_pkt bench_pkt; +static uint32_t bench_pkt_counter = 0; /* selects the source port in the many-flows case */ + +static void bench_update_stats_single_flow_fn(void *arg) +{ + (void)arg; + /* Same flow every time - hash hit case; the timestamp advances 1 us per call */ + bench_pkt.timestamp.tv_usec++; + if (bench_pkt.timestamp.tv_usec >= 1000000) { + bench_pkt.timestamp.tv_usec = 0; + bench_pkt.timestamp.tv_sec++; + } + tt_bench_update_stats(&bench_pkt); + BENCH_DONT_OPTIMIZE(bench_pkt.flow_rec.bytes); +} + +static void bench_update_stats_many_flows_fn(void *arg) +{ + (void)arg; + /* Different flow each time up to 10K flows, then wrap */ + bench_pkt_counter++; + bench_pkt.flow_rec.flow.sport = 1024 + (bench_pkt_counter % 10000); + bench_pkt.timestamp.tv_usec++; + if (bench_pkt.timestamp.tv_usec >= 1000000) { + bench_pkt.timestamp.tv_usec = 0; + bench_pkt.timestamp.tv_sec++; + } + tt_bench_update_stats(&bench_pkt); + BENCH_DONT_OPTIMIZE(bench_pkt.flow_rec.bytes); +} + +static int test_update_stats_performance(void) +{ + struct bench_result r_single, r_many; + + /* Initialize intervals subsystem */ + if (tt_bench_init() != 0) { + printf(" FAIL: could not initialize benchmark\n"); + return 1; + } + + /* Setup test packet */ + memset(&bench_pkt, 0, sizeof(bench_pkt)); + bench_pkt.flow_rec.flow.ethertype = 0x0800; + bench_pkt.flow_rec.flow.src_ip.s_addr = htonl(0x0a000001); + bench_pkt.flow_rec.flow.dst_ip.s_addr = htonl(0x0a000002); + bench_pkt.flow_rec.flow.sport = 12345; + bench_pkt.flow_rec.flow.dport = 80; + bench_pkt.flow_rec.flow.proto = 6; /* TCP */ + bench_pkt.flow_rec.bytes = 1000; + bench_pkt.flow_rec.packets = 1; + bench_pkt.timestamp.tv_sec = 1000000; + 
bench_pkt.timestamp.tv_usec = 0; + + /* Test single flow (hash hit case) */ + bench_run("update_stats (1 flow)", bench_update_stats_single_flow_fn, + NULL, UPDATE_STATS_ITERATIONS, &r_single); + + /* Cleanup and reinit for many-flow test; a failed reinit fails the test instead of benchmarking an uninitialized subsystem */ + tt_bench_cleanup(); + if (tt_bench_init() != 0) { printf(" FAIL: could not initialize benchmark\n"); return 1; } + bench_pkt_counter = 0; + bench_pkt.timestamp.tv_sec = 1000000; + bench_pkt.timestamp.tv_usec = 0; + + /* Test many flows (hash table stress) */ + bench_run("update_stats (10K flows)", bench_update_stats_many_flows_fn, + NULL, UPDATE_STATS_ITERATIONS, &r_many); + + tt_bench_cleanup(); + + printf(" single flow: %.1f cycles/op\n", r_single.cycles_per_op); + printf(" 10K flows: %.1f cycles/op\n", r_many.cycles_per_op); + printf(" threshold: %d cycles/op\n", UPDATE_STATS_THRESHOLD); + + if (r_single.cycles_per_op > UPDATE_STATS_THRESHOLD) { + printf(" FAIL: single flow update too slow\n"); + return 1; + } + if (r_many.cycles_per_op > UPDATE_STATS_THRESHOLD * 2) { + printf(" FAIL: many flow update too slow\n"); + return 1; + } + return 0; +} + +int main(void) +{ + printf("\n=== Performance Regression Tests ===\n\n"); + + printf("Test 1: Header decoding\n"); + tests_failed += test_decode_performance(); + + printf("\nTest 2: Ring buffer allocation\n"); + tests_failed += test_ringbuf_performance(); + + printf("\nTest 3: Top-N flow selection\n"); + tests_failed += test_topn_performance(); + + printf("\nTest 4: Per-packet stats update\n"); + tests_failed += test_update_stats_performance(); + + printf("\n=== Results: %d test(s) failed ===\n", + tests_failed); + + return tests_failed > 0 ? 1 : 0; +} diff --git a/deps/toptalk/bench_rotation.c b/deps/toptalk/bench_rotation.c new file mode 100644 index 0000000..f3cd882 --- /dev/null +++ b/deps/toptalk/bench_rotation.c @@ -0,0 +1,157 @@ +/* + * bench_rotation.c - Benchmark for interval table rotation + * + * Measures the cost of: + * 1. Copy-based rotation (current clear_table behavior) + * 2. 
Pointer swap rotation (proposed optimization) + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <arpa/inet.h> + +#include "bench_common.h" +#include "uthash.h" +#include "flow.h" + +/* Simplified flow hash entry for benchmarking */ +struct bench_flow_hash { + struct flow flow; + int64_t bytes; + int64_t packets; + UT_hash_handle hh; +}; + +/* Tables for copy-based benchmark */ +static struct bench_flow_hash *copy_incomplete = NULL; +static struct bench_flow_hash *copy_complete = NULL; + +/* Tables for swap-based benchmark */ +static struct bench_flow_hash *swap_tables[2] = {NULL, NULL}; +static int swap_write_idx = 0; + +/* Number of flows to test with */ +#define NUM_FLOWS 100 + +/* Populate a table with NUM_FLOWS entries */ +static void populate_table(struct bench_flow_hash **table) +{ + for (int i = 0; i < NUM_FLOWS; i++) { + struct bench_flow_hash *entry = calloc(1, sizeof(*entry)); + entry->flow.ethertype = 0x0800; + entry->flow.src_ip.s_addr = htonl(0x0a000001 + i); + entry->flow.dst_ip.s_addr = htonl(0x0a000100); + entry->flow.sport = 1024 + i; + entry->flow.dport = 80; + entry->bytes = 1000 * (i + 1); + entry->packets = 10 * (i + 1); + HASH_ADD(hh, *table, flow, sizeof(struct flow), entry); + } +} + +/* Free all entries in a table */ +static void free_table(struct bench_flow_hash **table) +{ + struct bench_flow_hash *entry, *tmp; + HASH_ITER(hh, *table, entry, tmp) { + HASH_DEL(*table, entry); + free(entry); + } + *table = NULL; +} + +/* Copy-based rotation (current behavior) */ +static void rotate_copy(void) +{ + struct bench_flow_hash *entry, *tmp; + + /* Free old complete table */ + free_table(&copy_complete); + + /* Copy incomplete to complete */ + HASH_ITER(hh, copy_incomplete, entry, tmp) { + struct bench_flow_hash *n = malloc(sizeof(*n)); + memcpy(n, entry, sizeof(*n)); + memset(&n->hh, 0, sizeof(n->hh)); + HASH_ADD(hh, copy_complete, flow, sizeof(struct flow), n); + } + + /* Free incomplete */ + free_table(&copy_incomplete); +} + +/* Swap-based rotation 
(proposed) */ +static void rotate_swap(void) +{ + int read_idx = 1 - swap_write_idx; + + /* Free old read table */ + free_table(&swap_tables[read_idx]); + + /* Swap: write becomes read, allocate new write */ + swap_tables[read_idx] = swap_tables[swap_write_idx]; + swap_tables[swap_write_idx] = NULL; + + /* Flip indices */ + swap_write_idx = read_idx; +} + +/* Benchmark 1: Copy-based rotation */ +static void bench_copy_rotation(void *arg) +{ + (void)arg; + /* Re-populate incomplete table for each iteration */ + populate_table(&copy_incomplete); + rotate_copy(); +} + +/* Benchmark 2: Swap-based rotation */ +static void bench_swap_rotation(void *arg) +{ + (void)arg; + /* Re-populate write table for each iteration */ + populate_table(&swap_tables[swap_write_idx]); + rotate_swap(); +} + +int main(void) +{ + struct bench_result r1, r2; + const uint64_t iterations = 1000; + + printf("\n=== Interval Table Rotation Benchmark ===\n"); + printf("Flows per table: %d\n", NUM_FLOWS); + + bench_report_header(); + + bench_run("copy-based rotation (current)", bench_copy_rotation, NULL, + iterations, &r1); + bench_report(&r1); + + /* Reset for swap benchmark */ + free_table(&swap_tables[0]); + free_table(&swap_tables[1]); + swap_write_idx = 0; + + bench_run("swap-based rotation (proposed)", bench_swap_rotation, NULL, + iterations, &r2); + bench_report(&r2); + + printf("\n--- Analysis ---\n"); + printf("Copy rotation: %.1f cycles (%.1f ns) per rotation\n", + r1.cycles_per_op, r1.ns_per_op); + printf("Swap rotation: %.1f cycles (%.1f ns) per rotation\n", + r2.cycles_per_op, r2.ns_per_op); + printf("Improvement: %.1f%% fewer cycles\n", + 100.0 * (r1.cycles_per_op - r2.cycles_per_op) / r1.cycles_per_op); + + /* Cleanup */ + free_table(&copy_incomplete); + free_table(&copy_complete); + free_table(&swap_tables[0]); + free_table(&swap_tables[1]); + + return 0; +} diff --git a/deps/toptalk/bench_sort.c b/deps/toptalk/bench_sort.c new file mode 100644 index 0000000..2a81b2e --- /dev/null +++ 
b/deps/toptalk/bench_sort.c @@ -0,0 +1,181 @@ +/* + * bench_sort.c - Benchmark for flow sorting + * + * Measures the cost of: + * 1. HASH_SRT (full sort every time - current behavior) + * 2. Partial sort / selection algorithm (proposed) + * 3. Incremental heap tracking (alternative) + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> /* NOTE(review): header names lost in extraction, inferred from usage - confirm */ +#include <arpa/inet.h> + +#include "bench_common.h" +#include "uthash.h" +#include "flow.h" + +/* Flow hash entry for sorting benchmarks */ +struct sort_flow_hash { + struct flow flow; + int64_t bytes; + int64_t packets; + UT_hash_handle hh; +}; + +/* Tables for different sorting approaches */ +static struct sort_flow_hash *hash_srt_table = NULL; +static struct sort_flow_hash *partial_sort_table = NULL; + +/* Number of flows and top-N to extract */ +#define NUM_FLOWS 100 +#define TOP_N 10 + +/* Comparison function for uthash HASH_SRT */ +static int bytes_cmp(struct sort_flow_hash *a, struct sort_flow_hash *b) +{ + /* Descending order */ + return (b->bytes > a->bytes) - (b->bytes < a->bytes); +} + +/* Populate a table with randomized byte counts */ +static void populate_sort_table(struct sort_flow_hash **table, unsigned int seed) +{ + srand(seed); + for (int i = 0; i < NUM_FLOWS; i++) { + struct sort_flow_hash *entry = calloc(1, sizeof(*entry)); + entry->flow.ethertype = 0x0800; + entry->flow.src_ip.s_addr = htonl(0x0a000001 + i); + entry->flow.dst_ip.s_addr = htonl(0x0a000100); + entry->flow.sport = 1024 + i; + entry->flow.dport = 80; + entry->bytes = rand() % 1000000; + entry->packets = entry->bytes / 100; + HASH_ADD(hh, *table, flow, sizeof(struct flow), entry); + } +} + +/* Free all entries in a table */ +static void free_sort_table(struct sort_flow_hash **table) +{ + struct sort_flow_hash *entry, *tmp; + HASH_ITER(hh, *table, entry, tmp) { + HASH_DEL(*table, entry); + free(entry); + } + *table = NULL; +} + +/* Partial sort: find top N using selection algorithm */ +static void find_top_n(struct sort_flow_hash *table, + struct 
sort_flow_hash **top, + int n) +{ + /* Simple O(n) approach: track top N in an array */ + int count = 0; + struct sort_flow_hash *entry, *tmp; + + HASH_ITER(hh, table, entry, tmp) { + if (count < n) { + /* Fill top array first */ + top[count++] = entry; + /* Keep sorted with insertion sort (small array) */ + for (int i = count - 1; i > 0; i--) { + if (top[i]->bytes > top[i-1]->bytes) { + struct sort_flow_hash *t = top[i]; + top[i] = top[i-1]; + top[i-1] = t; + } + } + } else if (entry->bytes > top[n-1]->bytes) { + /* Replace smallest in top */ + top[n-1] = entry; + /* Re-sort */ + for (int i = n - 1; i > 0; i--) { + if (top[i]->bytes > top[i-1]->bytes) { + struct sort_flow_hash *t = top[i]; + top[i] = top[i-1]; + top[i-1] = t; + } + } + } + } +} + +/* Benchmark context */ +struct sort_bench_ctx { + struct sort_flow_hash *top[TOP_N]; +}; + +static struct sort_bench_ctx ctx; + +/* Benchmark 1: HASH_SRT (current behavior) */ +static void bench_hash_srt(void *arg) +{ + (void)arg; + HASH_SRT(hh, hash_srt_table, bytes_cmp); + + /* Iterate to get top N (what the real code does) */ + struct sort_flow_hash *entry = hash_srt_table; + for (int i = 0; i < TOP_N && entry; i++) { + BENCH_DONT_OPTIMIZE(entry->bytes); + entry = entry->hh.next; + } +} + +/* Benchmark 2: Partial sort / selection */ +static void bench_partial_sort(void *arg) +{ + struct sort_bench_ctx *c = arg; + find_top_n(partial_sort_table, c->top, TOP_N); + + for (int i = 0; i < TOP_N; i++) { + BENCH_DONT_OPTIMIZE(c->top[i]->bytes); + } +} + +int main(void) +{ + struct bench_result r1, r2; + const uint64_t iterations = 1000; + + printf("\n=== Flow Sorting Benchmark ===\n"); + printf("Total flows: %d, extracting top %d\n", NUM_FLOWS, TOP_N); + + /* Create tables with same random data */ + populate_sort_table(&hash_srt_table, 42); + populate_sort_table(&partial_sort_table, 42); + + bench_report_header(); + + bench_run("HASH_SRT full sort (current)", bench_hash_srt, NULL, + iterations, &r1); + bench_report(&r1); + + 
bench_run("partial sort top-N (proposed)", bench_partial_sort, &ctx, + iterations, &r2); + bench_report(&r2); + + printf("\n--- Analysis ---\n"); + printf("HASH_SRT cost: %.1f cycles (%.1f ns) per sort\n", + r1.cycles_per_op, r1.ns_per_op); + printf("Partial sort: %.1f cycles (%.1f ns) per sort\n", + r2.cycles_per_op, r2.ns_per_op); + printf("Improvement: %.1f%% fewer cycles\n", + 100.0 * (r1.cycles_per_op - r2.cycles_per_op) / r1.cycles_per_op); + + /* Note: At 1ms tick rate, sorting happens 1000x/sec */ + printf("\nAt 1000 sorts/sec:\n"); + printf(" HASH_SRT: %.0f us/sec overhead\n", + r1.ns_per_op * 1000 / 1000.0); + printf(" Partial sort: %.0f us/sec overhead\n", + r2.ns_per_op * 1000 / 1000.0); + + /* Cleanup */ + free_sort_table(&hash_srt_table); + free_sort_table(&partial_sort_table); + + return 0; +} diff --git a/deps/toptalk/intervals.c b/deps/toptalk/intervals.c index a2d33db..3182417 100644 --- a/deps/toptalk/intervals.c +++ b/deps/toptalk/intervals.c @@ -620,6 +620,70 @@ static void update_stats_tables(struct flow_pkt *pkt) } } +/* + * Benchmark hooks: expose internals for performance testing. 
+ */
+
+/*
+ * Set up only the state the benchmarks need (no pcap): reference window,
+ * flow table, packet ring, flow pools, totals. Returns 0 on success, 1 when
+ * an allocation fails. NOTE(review): flow_ref_table is reset without giving
+ * its entries back to ref_flow_pool - confirm cleanup runs before re-init.
+ */
+int tt_bench_init(void)
+{
+	ref_window_size = (struct timeval){.tv_sec = 3, .tv_usec = 0 };
+	flow_ref_table = NULL;
+
+	/* Reuse an existing ring (rewound to empty), else allocate one */
+	if (pkt_ring.entries != NULL) {
+		pkt_ring.head = 0;
+		pkt_ring.tail = 0;
+	} else if (ring_buffer_alloc(TT_DEFAULT_RING_SIZE) != 0) {
+		return 1;
+	}
+	last_pkt_time = (struct timeval){ 0 };
+
+	/* Create the flow pools that do not exist yet */
+	if (ref_flow_pool.entries == NULL &&
+	    flow_pool_init(&ref_flow_pool, FLOW_POOL_SIZE) != 0) {
+		ring_buffer_free();
+		return 1;
+	}
+	if (interval_flow_pool.entries == NULL &&
+	    flow_pool_init(&interval_flow_pool, FLOW_POOL_SIZE * 2) != 0) {
+		flow_pool_cleanup(&ref_flow_pool);
+		ring_buffer_free();
+		return 1;
+	}
+
+	totals.bytes = 0;
+	totals.packets = 0;
+	return 0;
+}
+
+/* Release benchmark resources: reference flows, packet ring, flow pools */
+void tt_bench_cleanup(void)
+{
+	struct flow_hash *cur, *next;
+	/* Unlink each reference flow and hand it back to its pool */
+	HASH_ITER(r_hh, flow_ref_table, cur, next) {
+		HASH_DELETE(r_hh, flow_ref_table, cur);
+		flow_pool_free(&ref_flow_pool, cur);
+	}
+	flow_ref_table = NULL;
+
+	ring_buffer_free();
+	flow_pool_cleanup(&ref_flow_pool);
+	flow_pool_cleanup(&interval_flow_pool);
+}
+
+/* Run one packet through update_stats_tables(), the stats update path */
+void tt_bench_update_stats(struct flow_pkt *pkt)
+{
+	update_stats_tables(pkt);
+}
+
 #define DEBUG 1
 #if DEBUG
 static void dbg_per_second(struct tt_top_flows *t5)
diff --git a/deps/toptalk/intervals.h b/deps/toptalk/intervals.h
index bdb1c31..a433f03 100644
--- a/deps/toptalk/intervals.h
+++ b/deps/toptalk/intervals.h
@@ -68,4 +68,19 @@ void tt_set_rtp_forward_callback(tt_rtp_forward_cb cb);
 		exit(EXIT_FAILURE); \
 	} while (0)
 
+/*
+ * Benchmark hooks for performance testing.
+ * These expose internal functions without pcap/thread overhead.
+ */ + +/* Initialize data structures for benchmarking (no pcap needed); returns 0 on success, 1 if an allocation fails */ +int tt_bench_init(void); + +/* Cleanup after benchmarking: empties the reference flow table and frees the ring buffer and both flow pools */ +void tt_bench_cleanup(void); + +/* Process a decoded packet through the stats update path. + * This is the hot path that runs for every captured packet. */ +void tt_bench_update_stats(struct flow_pkt *pkt); + #endif