diff --git a/deps/toptalk/Makefile b/deps/toptalk/Makefile
index 53d99bc..d2f7da2 100644
--- a/deps/toptalk/Makefile
+++ b/deps/toptalk/Makefile
@@ -6,6 +6,12 @@ TEST_WINDOW = test-tcp-window
 TEST_VIDEO = test-video-detect
 TEST_RTSP = test-rtsp-tap
 
+# Benchmark targets
+BENCH_DECODE = bench-decode
+BENCH_MALLOC = bench-malloc
+BENCH_ROTATION = bench-rotation
+BENCH_SORT = bench-sort
+
 SRC = \
  decode.c \
  intervals.c \
@@ -25,7 +31,8 @@ HEADERS = \
  tcp_rtt.h \
  tcp_window.h \
  video_detect.h \
- video_metrics.h
+ video_metrics.h \
+ bench_common.h
 
 ifndef INTERVAL_COUNT
 INTERVAL_COUNT = 8
@@ -109,6 +116,58 @@ $(TEST_RTSP): $(LIB) test_rtsp_tap.c
 	@echo Building $(TEST_RTSP)
 	$(CC) -o $(TEST_RTSP) test_rtsp_tap.c $(LIB) $(LDLIBS) $(LDFLAGS) $(CFLAGS)
 
+# Benchmark targets - built without sanitizers for accurate timing
+# These need a clean library build without ASAN
+BENCH_CFLAGS := -g -O2 -Wall -pedantic -std=c11 $(DEFINES) -fPIC -fno-omit-frame-pointer
+BENCH_LDFLAGS := -lrt -lpthread $(PKGCONFIG_PCAP) $(PKGCONFIG_CURSES)
+
+# Sanitizer-free library for benchmarks; archives only the objects built from the SRC list
+BENCH_LIB = toptalk-bench.a
+
+$(BENCH_LIB): $(SRC) $(HEADERS) Makefile
+	@echo Building $(BENCH_LIB) without sanitizers
+	$(CC) -c $(SRC) $(BENCH_CFLAGS)
+	gcc-ar cr $(BENCH_LIB) $(notdir $(SRC:.c=.o))
+	@echo -e "$(BENCH_LIB) OK\n"
+
+$(BENCH_DECODE): $(BENCH_LIB) bench_decode.c bench_common.c bench_common.h
+	@echo Building $@
+	$(CC) -o $@ $(filter %.c,$^) $(BENCH_LIB) $(LDLIBS) $(BENCH_LDFLAGS) $(BENCH_CFLAGS)
+
+$(BENCH_MALLOC): $(BENCH_LIB) bench_malloc.c bench_common.c bench_common.h
+	@echo Building $@
+	$(CC) -o $@ $(filter %.c,$^) $(BENCH_LIB) $(LDLIBS) $(BENCH_LDFLAGS) $(BENCH_CFLAGS)
+
+$(BENCH_ROTATION): $(BENCH_LIB) bench_rotation.c bench_common.c bench_common.h
+	@echo Building $@
+	$(CC) -o $@ $(filter %.c,$^) $(BENCH_LIB) $(LDLIBS) $(BENCH_LDFLAGS) $(BENCH_CFLAGS)
+
+$(BENCH_SORT): $(BENCH_LIB) bench_sort.c bench_common.c bench_common.h
+	@echo Building $@
+	$(CC) -o $@ $(filter %.c,$^) $(BENCH_LIB) $(LDLIBS) $(BENCH_LDFLAGS) $(BENCH_CFLAGS)
+
+BENCH_REGRESSION = bench-regression
+# timeywimey.c is compiled straight into this binary, so it is listed as a prerequisite too
+$(BENCH_REGRESSION): $(BENCH_LIB) bench_regression.c bench_common.c bench_common.h timeywimey.c
+	@echo Building $(BENCH_REGRESSION)
+	$(CC) -o $(BENCH_REGRESSION) bench_regression.c bench_common.c timeywimey.c $(BENCH_LIB) $(LDLIBS) $(BENCH_LDFLAGS) $(BENCH_CFLAGS)
+
+.PHONY: bench
+bench: $(BENCH_DECODE) $(BENCH_MALLOC) $(BENCH_ROTATION) $(BENCH_SORT)
+	@echo "Running decode benchmark..."
+	@./$(BENCH_DECODE)
+	@echo "Running malloc benchmark..."
+	@./$(BENCH_MALLOC)
+	@echo "Running rotation benchmark..."
+	@./$(BENCH_ROTATION)
+	@echo "Running sort benchmark..."
+	@./$(BENCH_SORT)
+
+.PHONY: bench-test
+bench-test: $(BENCH_REGRESSION)
+	@echo "Running performance regression tests..."
+	@./$(BENCH_REGRESSION)
+
 .PHONY: test
 test: $(TEST) $(TEST_RTT) $(TEST_WINDOW) $(TEST_VIDEO) $(TEST_RTSP)
 	@echo "Running RTT unit tests (no root required)..."
@@ -133,4 +192,5 @@ clang-analyze: clean
 .PHONY: clean
 clean:
 	rm $(LIB) $(PROG) $(TEST) $(TEST_RTT) $(TEST_WINDOW) $(TEST_VIDEO) $(TEST_RTSP) *.o *.a || true
+	rm $(BENCH_DECODE) $(BENCH_MALLOC) $(BENCH_ROTATION) $(BENCH_SORT) $(BENCH_REGRESSION) || true
 	rm *.gcno *.gcov *.gcda || true
diff --git a/deps/toptalk/bench_common.c b/deps/toptalk/bench_common.c
new file mode 100644
index 0000000..fcc6fcb
--- /dev/null
+++ b/deps/toptalk/bench_common.c
@@ -0,0 +1,90 @@
+/*
+ * bench_common.c - Benchmark utilities implementation
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "bench_common.h"
+
+/* Get CPU frequency from /proc/cpuinfo (Linux) */
+uint64_t bench_get_cpu_freq(void)
+{
+	FILE *f = fopen("/proc/cpuinfo", "r");
+	if (!f) {
+		/* Fallback: assume 3 GHz */
+		return 3000000000ULL;
+	}
+
+	char line[256];
+	double mhz = 0.0;
+
+	while (fgets(line, sizeof(line), f)) {
+		if (strncmp(line, "cpu MHz", 7) == 0) {
+			char *colon = strchr(line, ':');
+			if (colon) {
+				mhz = atof(colon + 1);
+				break;
+			}
+		}
+	}
+	fclose(f);
+
+	if (mhz < 100.0) {
+		/* Fallback: assume 3 GHz */
+		return 3000000000ULL;
+	}
+
+	return (uint64_t)(mhz * 1e6);
+}
+
+/* Time `iterations` calls of fn(arg) with the TSC; per-op averages are written to *result. */
+void bench_run(const char *name,
+               void (*fn)(void *arg),
+               void *arg,
+               uint64_t iterations,
+               struct bench_result *result)
+{
+	uint64_t freq = bench_get_cpu_freq();
+	uint64_t start, end, total;
+	double ops = iterations ? (double)iterations : 1.0; /* never divide by zero */
+
+	/* Warm up - run a few iterations to prime caches */
+	for (uint64_t i = 0; i < 100 && i < iterations; i++)
+		fn(arg);
+
+	/* Timed run */
+	start = bench_start();
+	for (uint64_t i = 0; i < iterations; i++)
+		fn(arg);
+	end = bench_end();
+	total = bench_cycles(start, end);
+
+	/* Fill in results */
+	result->name = name;
+	result->iterations = iterations;
+	result->total_cycles = total;
+	result->cycles_per_op = (double)total / ops;
+	result->ns_per_op = bench_cycles_to_ns(total, freq) / ops;
+}
+
+void bench_report_header(void)
+{
+	printf("\n%-40s %12s %12s %12s\n",
+	       "Benchmark", "Iterations", "Cycles/op", "ns/op");
+	printf("%-40s %12s %12s %12s\n",
+	       "----------------------------------------",
+	       "------------", "------------", "------------");
+}
+
+/* Print one results row; iterations is cast since uint64_t is not always unsigned long. */
+void bench_report(const struct bench_result *result)
+{
+	printf("%-40s %12llu %12.1f %12.1f\n",
+	       result->name, (unsigned long long)result->iterations,
+	       result->cycles_per_op,
+	       result->ns_per_op);
+}
diff --git a/deps/toptalk/bench_common.h b/deps/toptalk/bench_common.h
new file mode 100644
index 0000000..ec8d6c1
--- /dev/null
+++ b/deps/toptalk/bench_common.h
@@ -0,0 +1,122 @@
+/*
+ * bench_common.h - Benchmark utilities for performance measurement
+ *
+ * Provides:
+ * - Cycle-accurate timing using rdtsc (x86_64)
+ * - Conversion of cycle counts to nanoseconds using the reported CPU frequency
+ * - Benchmark runner and reporting
+ */
+
+#ifndef BENCH_COMMON_H
+#define BENCH_COMMON_H
+
+#include <stdint.h>
+#include <stdio.h>
+
+/* Benchmark result structure */
+struct bench_result {
+	const char *name;
+	uint64_t iterations;
+	uint64_t total_cycles;
+	double cycles_per_op;
+	double ns_per_op;
+};
+
+/*
+ * Read CPU timestamp counter (x86_64).
+ * Returns current cycle count. Use bench_cycles() to compute elapsed.
+ */
+static inline uint64_t bench_start(void)
+{
+	uint32_t lo, hi;
+	/* cpuid serializes the CPU; "memory" stops the compiler moving loads/stores across */
+	__asm__ volatile (
+		"cpuid\n\t"
+		"rdtsc\n\t"
+		: "=a" (lo), "=d" (hi)
+		: "a" (0)
+		: "rbx", "rcx", "memory"
+	);
+	return ((uint64_t)hi << 32) | lo;
+}
+
+/*
+ * Read timestamp counter at end of measurement (rdtscp, then cpuid as a fence).
+ * The "memory" clobber stops the compiler moving loads/stores across this point.
+ */
+static inline uint64_t bench_end(void)
+{
+	uint32_t lo, hi;
+	__asm__ volatile (
+		"rdtscp\n\t"
+		"mov %%eax, %0\n\t"
+		"mov %%edx, %1\n\t"
+		"cpuid\n\t"
+		: "=r" (lo), "=r" (hi)
+		:
+		: "rax", "rbx", "rcx", "rdx", "memory"
+	);
+	return ((uint64_t)hi << 32) | lo;
+}
+
+/*
+ * Compute elapsed cycles between start and end.
+ */
+static inline uint64_t bench_cycles(uint64_t start, uint64_t end)
+{
+	return end - start;
+}
+
+/*
+ * Get approximate CPU frequency in Hz.
+ * Uses /proc/cpuinfo on Linux.
+ */
+uint64_t bench_get_cpu_freq(void);
+
+/*
+ * Convert cycles to nanoseconds given CPU frequency.
+ */
+static inline double bench_cycles_to_ns(uint64_t cycles, uint64_t freq_hz)
+{
+	return (double)cycles * 1e9 / (double)freq_hz;
+}
+
+/*
+ * Run a benchmark function multiple times and collect statistics.
+ *
+ * name: Benchmark name for reporting
+ * fn: Function to benchmark (called with arg)
+ * arg: Argument passed to fn
+ * iterations: Number of times to call fn
+ * result: Output benchmark results
+ */
+void bench_run(const char *name,
+               void (*fn)(void *arg),
+               void *arg,
+               uint64_t iterations,
+               struct bench_result *result);
+
+/*
+ * Print benchmark results in a formatted table.
+ */
+void bench_report(const struct bench_result *result);
+
+/*
+ * Print header for benchmark report table.
+ */
+void bench_report_header(void);
+
+/*
+ * Prevent compiler from optimizing away a value.
+ * Use to ensure benchmark results are "used".
+ */
+#define BENCH_DONT_OPTIMIZE(val) \
+	__asm__ volatile ("" : : "r,m" (val) : "memory")
+
+/*
+ * Memory barrier to prevent reordering.
+ */
+#define BENCH_BARRIER() \
+	__asm__ volatile ("" ::: "memory")
+
+#endif /* BENCH_COMMON_H */
diff --git a/deps/toptalk/bench_decode.c b/deps/toptalk/bench_decode.c
new file mode 100644
index 0000000..0742006
--- /dev/null
+++ b/deps/toptalk/bench_decode.c
@@ -0,0 +1,199 @@
+/*
+ * bench_decode.c - Benchmark for header parsing overhead
+ *
+ * Measures the cost of:
+ * 1. Single decode pass (decode_ethernet only)
+ * 2. Decode + find_tcp_header (current behavior)
+ * 3. Decode with stored offset (proposed optimization)
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <pcap.h>
+
+#include "bench_common.h"
+#include "flow.h"
+#include "decode.h"
+
+/* Synthetic TCP packet: Ethernet + IPv4 + TCP + payload */
+static uint8_t test_packet[128];
+static struct pcap_pkthdr test_pkthdr;
+
+/* Pre-computed L4 offset for the optimized path */
+static uint16_t precomputed_l4_offset;
+
+/* Build a synthetic TCP packet for benchmarking */
+static void build_test_packet(void)
+{
+	memset(test_packet, 0, sizeof(test_packet));
+
+	/* Ethernet header (14 bytes) */
+	uint8_t *eth = test_packet;
+	eth[12] = 0x08;  /* EtherType: IPv4 (0x0800) */
+	eth[13] = 0x00;
+
+	/* IPv4 header (20 bytes) at offset 14 */
+	uint8_t *ip4 = test_packet + 14;
+	ip4[0] = 0x45;         /* Version 4, IHL 5 (20 bytes) */
+	ip4[1] = 0x00;         /* DSCP/ECN */
+	ip4[2] = 0x00;         /* Total length: 66 (20 IP + 20 TCP + 26 payload) */
+	ip4[3] = 0x42;
+	ip4[8] = 0x40;         /* TTL */
+	ip4[9] = 0x06;         /* Protocol: TCP */
+	/* Source IP: 10.0.0.1 */
+	ip4[12] = 10; ip4[13] = 0; ip4[14] = 0; ip4[15] = 1;
+	/* Dest IP: 10.0.0.2 */
+	ip4[16] = 10; ip4[17] = 0; ip4[18] = 0; ip4[19] = 2;
+
+	/* TCP header (20 bytes) at offset 34 */
+	uint8_t *tcp = test_packet + 34;
+	tcp[0] = 0x04;  /* Source port: 1234 */
+	tcp[1] = 0xD2;
+	tcp[2] = 0x00;  /* Dest port: 80 */
+	tcp[3] = 0x50;
+	/* Sequence number */
+	tcp[4] = 0x00; tcp[5] = 0x00; tcp[6] = 0x10; tcp[7] = 0x00;
+	/* Ack number */
+	tcp[8] = 0x00; tcp[9] = 0x00; tcp[10] = 0x20; tcp[11] = 0x00;
+	tcp[12] = 0x50;  /* Data offset: 5 (20 bytes), flags: 0 */
+	tcp[13] = 0x10;  /* ACK flag */
+	tcp[14] = 0xFF;  /* Window: 65535 */
+	tcp[15] = 0xFF;
+
+	/* Payload starts at offset 54 */
+	memset(test_packet + 54, 'A', 26);
+
+	/* Pcap header */
+	test_pkthdr.ts.tv_sec = 0;
+	test_pkthdr.ts.tv_usec = 0;
+	test_pkthdr.caplen = 80;
+	test_pkthdr.len = 80;
+
+	/* Pre-compute L4 offset: Ethernet (14) + IPv4 (20) = 34 */
+	precomputed_l4_offset = 34;
+}
+
+/* Benchmark 1: decode_ethernet only (baseline) */
+static void bench_decode_only(void *arg)
+{
+	(void)arg;
+	struct flow_pkt pkt;
+	char errbuf[256];
+
+	int ret = decode_ethernet(&test_pkthdr, test_packet, &pkt, errbuf);
+	BENCH_DONT_OPTIMIZE(ret);
+	BENCH_DONT_OPTIMIZE(pkt.flow_rec.flow.proto);
+}
+
+/* Stand-in for find_tcp_header (static in intervals.c): walks Ethernet/VLAN/IPv4 in
+ * packet[0..caplen) and returns the TCP header, or NULL if it is absent or truncated */
+static const struct hdr_tcp *find_tcp_header_sim(const uint8_t *packet,
+                                                  uint32_t caplen)
+{
+	/* Simulate the header traversal that find_tcp_header does */
+	if (caplen < 14)
+		return NULL;
+
+	/* Check ethertype */
+	uint16_t ethertype = (packet[12] << 8) | packet[13];
+	const uint8_t *ip_start;
+
+	if (ethertype == 0x8100 && caplen >= 18) {
+		/* VLAN - skip 4 more bytes (only when the whole tag was captured) */
+		ethertype = (packet[16] << 8) | packet[17];
+		ip_start = packet + 18;
+		caplen -= 18;
+	} else {
+		ip_start = packet + 14;
+		caplen -= 14;
+	}
+
+	if (ethertype != 0x0800)  /* Not IPv4, or a truncated VLAN tag */
+		return NULL;
+
+	if (caplen < 20)
+		return NULL;
+
+	/* IPv4 header length; anything under 20 bytes is malformed */
+	uint8_t ihl = (ip_start[0] & 0x0F) * 4;
+	if (ihl < 20 || caplen < (uint32_t)ihl + 20)
+		return NULL;
+
+	/* Check protocol is TCP */
+	if (ip_start[9] != 6)
+		return NULL;
+
+	return (const struct hdr_tcp *)(ip_start + ihl);
+}
+
+/* Benchmark 2: decode_ethernet + find_tcp_header (current behavior) */
+static void bench_decode_plus_find(void *arg)
+{
+	(void)arg;
+	struct flow_pkt pkt;
+	char errbuf[256];
+
+	int ret = decode_ethernet(&test_pkthdr, test_packet, &pkt, errbuf);
+	BENCH_DONT_OPTIMIZE(ret);
+
+	/* Simulate what handle_packet does - re-parse to find TCP header */
+	const struct hdr_tcp *tcp = find_tcp_header_sim(test_packet,
+	                                                 test_pkthdr.caplen);
+	BENCH_DONT_OPTIMIZE(tcp);
+}
+
+/* Benchmark 3: decode with stored offset (proposed optimization) */
+static void bench_decode_with_offset(void *arg)
+{
+	(void)arg;
+	struct flow_pkt pkt;
+	char errbuf[256];
+
+	int ret = decode_ethernet(&test_pkthdr, test_packet, &pkt, errbuf);
+	BENCH_DONT_OPTIMIZE(ret);
+
+	/* Use pre-computed offset - O(1) pointer arithmetic */
+	const struct hdr_tcp *tcp = (const struct hdr_tcp *)(test_packet + precomputed_l4_offset);
+	BENCH_DONT_OPTIMIZE(tcp);
+}
+
+int main(void)
+{
+	struct bench_result r1, r2, r3;
+	const uint64_t iterations = 100000;
+
+	printf("\n=== Header Parsing Benchmark ===\n");
+	printf("Packet: Ethernet + IPv4 + TCP (80 bytes)\n");
+
+	build_test_packet();
+
+	bench_report_header();
+
+	bench_run("decode_ethernet (baseline)", bench_decode_only, NULL,
+	          iterations, &r1);
+	bench_report(&r1);
+
+	bench_run("decode + find_tcp_header (current)", bench_decode_plus_find, NULL,
+	          iterations, &r2);
+	bench_report(&r2);
+
+	bench_run("decode + stored offset (proposed)", bench_decode_with_offset, NULL,
+	          iterations, &r3);
+	bench_report(&r3);
+
+	printf("\n--- Analysis ---\n");
+	printf("find_tcp_header overhead: %.1f cycles (%.1f ns)\n",
+	       r2.cycles_per_op - r1.cycles_per_op,
+	       r2.ns_per_op - r1.ns_per_op);
+	printf("Stored offset overhead:   %.1f cycles (%.1f ns)\n",
+	       r3.cycles_per_op - r1.cycles_per_op,
+	       r3.ns_per_op - r1.ns_per_op);
+	printf("Savings from optimization: %.1f cycles (%.1f%%)\n",
+	       r2.cycles_per_op - r3.cycles_per_op,
+	       100.0 * (r2.cycles_per_op - r3.cycles_per_op) / r2.cycles_per_op);
+
+	return 0;
+}
diff --git a/deps/toptalk/bench_malloc.c b/deps/toptalk/bench_malloc.c
new file mode 100644
index 0000000..fa062a1
--- /dev/null
+++ b/deps/toptalk/bench_malloc.c
@@ -0,0 +1,123 @@
+/*
+ * bench_malloc.c - Benchmark for per-packet allocation overhead
+ *
+ * Measures the cost of:
+ * 1. malloc + free per packet (current behavior)
+ * 2. Ring buffer slot reuse (proposed optimization)
+ * 3. Pool allocator (alternative)
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bench_common.h"
+#include "flow.h"
+
+/* Size of flow_pkt_list structure (approximate) */
+#define ALLOC_SIZE 600
+
+/* Ring buffer for proposed optimization */
+#define RING_SIZE 4096
+static uint8_t ring_buffer[RING_SIZE][ALLOC_SIZE];
+static uint32_t ring_head = 0;
+
+/* Simple pool allocator for comparison */
+#define POOL_SIZE 4096
+static uint8_t pool_buffer[POOL_SIZE][ALLOC_SIZE];
+static uint8_t *pool_free_list[POOL_SIZE];
+static int pool_free_count = POOL_SIZE;
+
+static void pool_init(void)
+{
+	for (int i = 0; i < POOL_SIZE; i++) {
+		pool_free_list[i] = pool_buffer[i];
+	}
+	pool_free_count = POOL_SIZE;
+}
+
+static void *pool_alloc(void)
+{
+	if (pool_free_count == 0)
+		return NULL;
+	return pool_free_list[--pool_free_count];
+}
+
+static void pool_free(void *ptr)
+{
+	if (pool_free_count < POOL_SIZE)
+		pool_free_list[pool_free_count++] = ptr;
+}
+
+/* Benchmark 1: malloc + free (current behavior) */
+static void bench_malloc_free(void *arg)
+{
+	(void)arg;
+	void *p = malloc(ALLOC_SIZE);
+	BENCH_DONT_OPTIMIZE(p);
+	if (p)	/* touch memory to ensure it's allocated; malloc may return NULL */
+		memset(p, 0, 64);
+	free(p);
+}
+
+/* Benchmark 2: Ring buffer slot reuse (proposed) */
+static void bench_ring_buffer(void *arg)
+{
+	(void)arg;
+	void *p = ring_buffer[ring_head & (RING_SIZE - 1)];
+	ring_head++;
+	BENCH_DONT_OPTIMIZE(p);
+	/* Touch memory like we would in real usage */
+	memset(p, 0, 64);
+	/* No free needed - slot is reused on wrap */
+}
+
+/* Benchmark 3: Pool allocator */
+static void bench_pool_alloc(void *arg)
+{
+	(void)arg;
+	void *p = pool_alloc();
+	BENCH_DONT_OPTIMIZE(p);
+	if (p) {
+		memset(p, 0, 64);
+		pool_free(p);
+	}
+}
+
+int main(void)
+{
+	struct bench_result r1, r2, r3;
+	const uint64_t iterations = 1000000;
+
+	printf("\n=== Per-Packet Allocation Benchmark ===\n");
+	printf("Allocation size: %d bytes (approx flow_pkt_list)\n", ALLOC_SIZE);
+
+	pool_init();
+
+	bench_report_header();
+
+	bench_run("malloc + free (current)", bench_malloc_free, NULL,
+	          iterations, &r1);
+	bench_report(&r1);
+
+	bench_run("ring buffer (proposed)", bench_ring_buffer, NULL,
+	          iterations, &r2);
+	bench_report(&r2);
+
+	bench_run("pool allocator (alternative)", bench_pool_alloc, NULL,
+	          iterations, &r3);
+	bench_report(&r3);
+
+	printf("\n--- Analysis ---\n");
+	printf("malloc/free cost:    %.1f cycles (%.1f ns)\n",
+	       r1.cycles_per_op, r1.ns_per_op);
+	printf("Ring buffer cost:    %.1f cycles (%.1f ns)\n",
+	       r2.cycles_per_op, r2.ns_per_op);
+	printf("Pool allocator cost: %.1f cycles (%.1f ns)\n",
+	       r3.cycles_per_op, r3.ns_per_op);
+	printf("Ring buffer savings: %.1f%% vs malloc\n",
+	       100.0 * (r1.cycles_per_op - r2.cycles_per_op) / r1.cycles_per_op);
+
+	return 0;
+}
diff --git a/deps/toptalk/bench_regression.c b/deps/toptalk/bench_regression.c
new file mode 100644
index 0000000..63bee9b
--- /dev/null
+++ b/deps/toptalk/bench_regression.c
@@ -0,0 +1,366 @@
+/*
+ * bench_regression.c - Performance regression tests
+ *
+ * Verifies that key operations meet minimum performance thresholds.
+ * Run with: make bench-regression && ./bench-regression
+ *
+ * Returns 0 if all tests pass, 1 if any test fails.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <pcap.h>
+
+#include "bench_common.h"
+#include "flow.h"
+#include "decode.h"
+#include "intervals.h"
+#include "uthash.h"
+
+/* Test thresholds (cycles per operation) - set conservatively high */
+#define DECODE_THRESHOLD_CYCLES      500    /* Header parsing */
+#define MALLOC_THRESHOLD_CYCLES      200    /* Allocation overhead */
+#define RINGBUF_THRESHOLD_CYCLES     100    /* Ring buffer ops including memset */
+#define TOPN_THRESHOLD_CYCLES       5000    /* Top-N selection for 100 flows */
+#define UPDATE_STATS_THRESHOLD      1000    /* Per-packet stats update (target: 200) */
+
+/* Test iteration counts - enough for stable measurements */
+#define DECODE_ITERATIONS     10000
+#define MALLOC_ITERATIONS    100000
+#define TOPN_ITERATIONS        1000
+#define UPDATE_STATS_ITERATIONS 100000
+
+static int tests_failed = 0;
+
+/* Synthetic TCP packet for decode test */
+static uint8_t test_packet[128];
+static struct pcap_pkthdr test_pkthdr;
+
+static void build_test_packet(void)
+{
+	memset(test_packet, 0, sizeof(test_packet));
+
+	/* Ethernet header (14 bytes) */
+	uint8_t *eth = test_packet;
+	eth[12] = 0x08;  /* EtherType: IPv4 (0x0800) */
+	eth[13] = 0x00;
+
+	/* IPv4 header (20 bytes) at offset 14 */
+	uint8_t *ip4 = test_packet + 14;
+	ip4[0] = 0x45;         /* Version 4, IHL 5 (20 bytes) */
+	ip4[1] = 0x00;         /* DSCP/ECN */
+	ip4[2] = 0x00;         /* Total length: 66 (20 IP + 20 TCP + 26 payload) */
+	ip4[3] = 0x42;
+	ip4[8] = 0x40;         /* TTL */
+	ip4[9] = 0x06;         /* Protocol: TCP */
+	/* Source IP: 10.0.0.1 */
+	ip4[12] = 10; ip4[13] = 0; ip4[14] = 0; ip4[15] = 1;
+	/* Dest IP: 10.0.0.2 */
+	ip4[16] = 10; ip4[17] = 0; ip4[18] = 0; ip4[19] = 2;
+
+	/* TCP header (20 bytes) at offset 34 */
+	uint8_t *tcp = test_packet + 34;
+	tcp[0] = 0x04;  /* Source port: 1234 */
+	tcp[1] = 0xD2;
+	tcp[2] = 0x00;  /* Dest port: 80 */
+	tcp[3] = 0x50;
+	/* Sequence number */
+	tcp[4] = 0x00; tcp[5] = 0x00; tcp[6] = 0x10; tcp[7] = 0x00;
+	/* Ack number */
+	tcp[8] = 0x00; tcp[9] = 0x00; tcp[10] = 0x20; tcp[11] = 0x00;
+	tcp[12] = 0x50;  /* Data offset: 5 (20 bytes), flags: 0 */
+	tcp[13] = 0x10;  /* ACK flag */
+	tcp[14] = 0xFF;  /* Window: 65535 */
+	tcp[15] = 0xFF;
+
+	/* Payload starts at offset 54 */
+	memset(test_packet + 54, 'A', 26);
+
+	/* Pcap header */
+	test_pkthdr.ts.tv_sec = 0;
+	test_pkthdr.ts.tv_usec = 0;
+	test_pkthdr.caplen = 80;
+	test_pkthdr.len = 80;
+}
+
+/*
+ * Test 1: Decode performance
+ */
+static void bench_decode_fn(void *arg)
+{
+	(void)arg;
+	struct flow_pkt pkt;
+	char errbuf[256];
+	int ret = decode_ethernet(&test_pkthdr, test_packet, &pkt, errbuf);
+	BENCH_DONT_OPTIMIZE(ret);
+	BENCH_DONT_OPTIMIZE(pkt.flow_rec.flow.proto);
+}
+
+static int test_decode_performance(void)
+{
+	struct bench_result r;
+
+	build_test_packet();
+	bench_run("decode_ethernet", bench_decode_fn, NULL, DECODE_ITERATIONS, &r);
+
+	printf("  decode: %.1f cycles/op (threshold: %d)\n",
+	       r.cycles_per_op, DECODE_THRESHOLD_CYCLES);
+
+	if (r.cycles_per_op > DECODE_THRESHOLD_CYCLES) {
+		printf("  FAIL: decode too slow\n");
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Test 2: Ring buffer performance (simulated)
+ */
+#define RING_SIZE 4096
+static uint8_t ring_buffer[RING_SIZE][600];
+static uint32_t ring_head = 0;
+
+static void bench_ringbuf_fn(void *arg)
+{
+	(void)arg;
+	void *p = ring_buffer[ring_head & (RING_SIZE - 1)];
+	ring_head++;
+	BENCH_DONT_OPTIMIZE(p);
+	memset(p, 0, 64);
+}
+
+static int test_ringbuf_performance(void)
+{
+	struct bench_result r;
+
+	ring_head = 0;
+	bench_run("ring_buffer", bench_ringbuf_fn, NULL, MALLOC_ITERATIONS, &r);
+
+	printf("  ring buffer: %.1f cycles/op (threshold: %d)\n",
+	       r.cycles_per_op, RINGBUF_THRESHOLD_CYCLES);
+
+	if (r.cycles_per_op > RINGBUF_THRESHOLD_CYCLES) {
+		printf("  FAIL: ring buffer too slow\n");
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Test 3: Top-N selection performance
+ */
+struct topn_flow_hash {
+	struct flow flow;
+	int64_t bytes;
+	UT_hash_handle hh;
+};
+
+static struct topn_flow_hash *topn_table = NULL;
+#define TOPN_FLOWS 100
+#define TOPN_N 10
+
+static struct topn_flow_hash *topn_result[TOPN_N];
+
+static void populate_topn_table(void)
+{
+	for (int i = 0; i < TOPN_FLOWS; i++) {
+		struct topn_flow_hash *entry = calloc(1, sizeof(*entry));
+		entry->flow.ethertype = 0x0800;
+		entry->flow.src_ip.s_addr = htonl(0x0a000001 + i);
+		entry->flow.dst_ip.s_addr = htonl(0x0a000100);
+		entry->flow.sport = 1024 + i;
+		entry->flow.dport = 80;
+		entry->bytes = rand() % 1000000;
+		HASH_ADD(hh, topn_table, flow, sizeof(struct flow), entry);
+	}
+}
+
+static void free_topn_table(void)
+{
+	struct topn_flow_hash *entry, *tmp;
+	HASH_ITER(hh, topn_table, entry, tmp) {
+		HASH_DEL(topn_table, entry);
+		free(entry);
+	}
+	topn_table = NULL;
+}
+
+static void find_top_n(void)
+{
+	struct topn_flow_hash *iter, *tmp;
+	int count = 0;
+
+	HASH_ITER(hh, topn_table, iter, tmp) {
+		if (count < TOPN_N) {
+			topn_result[count++] = iter;
+			for (int i = count - 1; i > 0; i--) {
+				if (topn_result[i]->bytes > topn_result[i-1]->bytes) {
+					struct topn_flow_hash *t = topn_result[i];
+					topn_result[i] = topn_result[i-1];
+					topn_result[i-1] = t;
+				}
+			}
+		} else if (iter->bytes > topn_result[TOPN_N-1]->bytes) {
+			topn_result[TOPN_N-1] = iter;
+			for (int i = TOPN_N - 1; i > 0; i--) {
+				if (topn_result[i]->bytes > topn_result[i-1]->bytes) {
+					struct topn_flow_hash *t = topn_result[i];
+					topn_result[i] = topn_result[i-1];
+					topn_result[i-1] = t;
+				}
+			}
+		}
+	}
+}
+
+static void bench_topn_fn(void *arg)
+{
+	(void)arg;
+	find_top_n();
+	BENCH_DONT_OPTIMIZE(topn_result[0]);
+}
+
+static int test_topn_performance(void)
+{
+	struct bench_result r;
+
+	srand(42);
+	populate_topn_table();
+
+	bench_run("top-N selection", bench_topn_fn, NULL, TOPN_ITERATIONS, &r);
+
+	free_topn_table();
+
+	printf("  top-N: %.1f cycles/op (threshold: %d)\n",
+	       r.cycles_per_op, TOPN_THRESHOLD_CYCLES);
+
+	if (r.cycles_per_op > TOPN_THRESHOLD_CYCLES) {
+		printf("  FAIL: top-N selection too slow\n");
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Test 4: Per-packet stats update (the real hot path)
+ *
+ * This measures the actual per-packet processing path:
+ * - Ring buffer insertion
+ * - Hash table lookup/insert
+ * - Interval delta accumulation
+ * - Expiration check
+ *
+ * Tests two scenarios:
+ * - Single flow (best case: hash hit)
+ * - Many flows (stress test: hash table growth)
+ */
+static struct flow_pkt bench_pkt;
+static uint32_t bench_pkt_counter = 0;
+
+static void bench_update_stats_single_flow_fn(void *arg)
+{
+	(void)arg;
+	/* Same flow every time - hash hit case */
+	bench_pkt.timestamp.tv_usec++;
+	if (bench_pkt.timestamp.tv_usec >= 1000000) {
+		bench_pkt.timestamp.tv_usec = 0;
+		bench_pkt.timestamp.tv_sec++;
+	}
+	tt_bench_update_stats(&bench_pkt);
+	BENCH_DONT_OPTIMIZE(bench_pkt.flow_rec.bytes);
+}
+
+static void bench_update_stats_many_flows_fn(void *arg)
+{
+	(void)arg;
+	/* Different flow each time up to 10K flows, then wrap */
+	bench_pkt_counter++;
+	bench_pkt.flow_rec.flow.sport = 1024 + (bench_pkt_counter % 10000);
+	bench_pkt.timestamp.tv_usec++;
+	if (bench_pkt.timestamp.tv_usec >= 1000000) {
+		bench_pkt.timestamp.tv_usec = 0;
+		bench_pkt.timestamp.tv_sec++;
+	}
+	tt_bench_update_stats(&bench_pkt);
+	BENCH_DONT_OPTIMIZE(bench_pkt.flow_rec.bytes);
+}
+
+static int test_update_stats_performance(void)
+{
+	struct bench_result r_single, r_many;
+
+	/* Initialize intervals subsystem */
+	if (tt_bench_init() != 0) {
+		printf("  FAIL: could not initialize benchmark\n");
+		return 1;
+	}
+
+	/* Setup test packet */
+	memset(&bench_pkt, 0, sizeof(bench_pkt));
+	bench_pkt.flow_rec.flow.ethertype = 0x0800;
+	bench_pkt.flow_rec.flow.src_ip.s_addr = htonl(0x0a000001);
+	bench_pkt.flow_rec.flow.dst_ip.s_addr = htonl(0x0a000002);
+	bench_pkt.flow_rec.flow.sport = 12345;
+	bench_pkt.flow_rec.flow.dport = 80;
+	bench_pkt.flow_rec.flow.proto = 6;  /* TCP */
+	bench_pkt.flow_rec.bytes = 1000;
+	bench_pkt.flow_rec.packets = 1;
+	bench_pkt.timestamp.tv_sec = 1000000;
+	bench_pkt.timestamp.tv_usec = 0;
+
+	/* Test single flow (hash hit case) */
+	bench_run("update_stats (1 flow)", bench_update_stats_single_flow_fn,
+	          NULL, UPDATE_STATS_ITERATIONS, &r_single);
+	/* Cleanup and reinit for many-flow test; a failed reinit fails the test */
+	tt_bench_cleanup();
+	if (tt_bench_init() != 0) {
+		printf("  FAIL: could not reinitialize benchmark\n");
+		return 1;
+	}
+	bench_pkt_counter = 0;
+	bench_pkt.timestamp.tv_sec = 1000000;
+	bench_pkt.timestamp.tv_usec = 0;
+
+	/* Test many flows (hash table stress) */
+	bench_run("update_stats (10K flows)", bench_update_stats_many_flows_fn,
+	          NULL, UPDATE_STATS_ITERATIONS, &r_many);
+	tt_bench_cleanup();
+
+	printf("  single flow: %.1f cycles/op\n", r_single.cycles_per_op);
+	printf("  10K flows:   %.1f cycles/op\n", r_many.cycles_per_op);
+	printf("  threshold:   %d cycles/op\n", UPDATE_STATS_THRESHOLD);
+	if (r_single.cycles_per_op > UPDATE_STATS_THRESHOLD) {
+		printf("  FAIL: single flow update too slow\n");
+		return 1;
+	}
+	if (r_many.cycles_per_op > UPDATE_STATS_THRESHOLD * 2) {
+		printf("  FAIL: many flow update too slow\n");
+		return 1;
+	}
+	return 0;
+}
+
+int main(void)
+{
+	printf("\n=== Performance Regression Tests ===\n\n");
+
+	printf("Test 1: Header decoding\n");
+	tests_failed += test_decode_performance();
+
+	printf("\nTest 2: Ring buffer allocation\n");
+	tests_failed += test_ringbuf_performance();
+
+	printf("\nTest 3: Top-N flow selection\n");
+	tests_failed += test_topn_performance();
+
+	printf("\nTest 4: Per-packet stats update\n");
+	tests_failed += test_update_stats_performance();
+
+	printf("\n=== Results: %d test(s) failed ===\n",
+	       tests_failed);
+
+	return tests_failed > 0 ? 1 : 0;
+}
diff --git a/deps/toptalk/bench_rotation.c b/deps/toptalk/bench_rotation.c
new file mode 100644
index 0000000..f3cd882
--- /dev/null
+++ b/deps/toptalk/bench_rotation.c
@@ -0,0 +1,157 @@
+/*
+ * bench_rotation.c - Benchmark for interval table rotation
+ *
+ * Measures the cost of:
+ * 1. Copy-based rotation (current clear_table behavior)
+ * 2. Pointer swap rotation (proposed optimization)
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <arpa/inet.h>
+
+#include "bench_common.h"
+#include "uthash.h"
+#include "flow.h"
+
+/* Simplified flow hash entry for benchmarking */
+struct bench_flow_hash {
+	struct flow flow;
+	int64_t bytes;
+	int64_t packets;
+	UT_hash_handle hh;
+};
+
+/* Tables for copy-based benchmark */
+static struct bench_flow_hash *copy_incomplete = NULL;
+static struct bench_flow_hash *copy_complete = NULL;
+
+/* Tables for swap-based benchmark */
+static struct bench_flow_hash *swap_tables[2] = {NULL, NULL};
+static int swap_write_idx = 0;
+
+/* Number of flows to test with */
+#define NUM_FLOWS 100
+
+/* Populate a table with NUM_FLOWS entries */
+static void populate_table(struct bench_flow_hash **table)
+{
+	for (int i = 0; i < NUM_FLOWS; i++) {
+		struct bench_flow_hash *entry = calloc(1, sizeof(*entry));
+		entry->flow.ethertype = 0x0800;
+		entry->flow.src_ip.s_addr = htonl(0x0a000001 + i);
+		entry->flow.dst_ip.s_addr = htonl(0x0a000100);
+		entry->flow.sport = 1024 + i;
+		entry->flow.dport = 80;
+		entry->bytes = 1000 * (i + 1);
+		entry->packets = 10 * (i + 1);
+		HASH_ADD(hh, *table, flow, sizeof(struct flow), entry);
+	}
+}
+
+/* Free all entries in a table */
+static void free_table(struct bench_flow_hash **table)
+{
+	struct bench_flow_hash *entry, *tmp;
+	HASH_ITER(hh, *table, entry, tmp) {
+		HASH_DEL(*table, entry);
+		free(entry);
+	}
+	*table = NULL;
+}
+
+/* Copy-based rotation (current behavior) */
+static void rotate_copy(void)
+{
+	struct bench_flow_hash *entry, *tmp;
+
+	/* Free old complete table */
+	free_table(&copy_complete);
+
+	/* Copy incomplete to complete */
+	HASH_ITER(hh, copy_incomplete, entry, tmp) {
+		struct bench_flow_hash *n = malloc(sizeof(*n));
+		memcpy(n, entry, sizeof(*n));
+		memset(&n->hh, 0, sizeof(n->hh));
+		HASH_ADD(hh, copy_complete, flow, sizeof(struct flow), n);
+	}
+
+	/* Free incomplete */
+	free_table(&copy_incomplete);
+}
+
+/* Swap-based rotation (proposed) */
+static void rotate_swap(void)
+{
+	int read_idx = 1 - swap_write_idx;
+
+	/* Free old read table */
+	free_table(&swap_tables[read_idx]);
+
+	/* Swap: write becomes read, allocate new write */
+	swap_tables[read_idx] = swap_tables[swap_write_idx];
+	swap_tables[swap_write_idx] = NULL;
+
+	/* Flip indices */
+	swap_write_idx = read_idx;
+}
+
+/* Benchmark 1: Copy-based rotation */
+static void bench_copy_rotation(void *arg)
+{
+	(void)arg;
+	/* Re-populate incomplete table for each iteration */
+	populate_table(&copy_incomplete);
+	rotate_copy();
+}
+
+/* Benchmark 2: Swap-based rotation */
+static void bench_swap_rotation(void *arg)
+{
+	(void)arg;
+	/* Re-populate write table for each iteration */
+	populate_table(&swap_tables[swap_write_idx]);
+	rotate_swap();
+}
+
+int main(void)
+{
+	struct bench_result r1, r2;
+	const uint64_t iterations = 1000;
+
+	printf("\n=== Interval Table Rotation Benchmark ===\n");
+	printf("Flows per table: %d\n", NUM_FLOWS);
+
+	bench_report_header();
+
+	bench_run("copy-based rotation (current)", bench_copy_rotation, NULL,
+	          iterations, &r1);
+	bench_report(&r1);
+
+	/* Reset for swap benchmark */
+	free_table(&swap_tables[0]);
+	free_table(&swap_tables[1]);
+	swap_write_idx = 0;
+
+	bench_run("swap-based rotation (proposed)", bench_swap_rotation, NULL,
+	          iterations, &r2);
+	bench_report(&r2);
+
+	printf("\n--- Analysis ---\n");
+	printf("Copy rotation:  %.1f cycles (%.1f ns) per rotation\n",
+	       r1.cycles_per_op, r1.ns_per_op);
+	printf("Swap rotation:  %.1f cycles (%.1f ns) per rotation\n",
+	       r2.cycles_per_op, r2.ns_per_op);
+	printf("Improvement:    %.1f%% fewer cycles\n",
+	       100.0 * (r1.cycles_per_op - r2.cycles_per_op) / r1.cycles_per_op);
+
+	/* Cleanup */
+	free_table(&copy_incomplete);
+	free_table(&copy_complete);
+	free_table(&swap_tables[0]);
+	free_table(&swap_tables[1]);
+
+	return 0;
+}
diff --git a/deps/toptalk/bench_sort.c b/deps/toptalk/bench_sort.c
new file mode 100644
index 0000000..2a81b2e
--- /dev/null
+++ b/deps/toptalk/bench_sort.c
@@ -0,0 +1,181 @@
+/*
+ * bench_sort.c - Benchmark for flow sorting
+ *
+ * Measures the cost of:
+ * 1. HASH_SRT (full sort every time - current behavior)
+ * 2. Partial sort / selection algorithm (proposed)
+ * (Incremental heap tracking is a further alternative; it is not benchmarked here.)
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <arpa/inet.h>
+
+#include "bench_common.h"
+#include "uthash.h"
+#include "flow.h"
+
+/* Flow hash entry for sorting benchmarks */
+struct sort_flow_hash {
+	struct flow flow; /* hash key: HASH_ADD is given this field and sizeof(struct flow) */
+	int64_t bytes; /* byte count; the value both benchmarks rank by */
+	int64_t packets; /* packet count; filled in by populate_sort_table(), not used for ranking */
+	UT_hash_handle hh; /* uthash handle named in every HASH_* call in this file */
+};
+
+/* Tables for different sorting approaches: one per benchmark, filled separately by main() */
+static struct sort_flow_hash *hash_srt_table = NULL; /* re-ordered in place by bench_hash_srt() */
+static struct sort_flow_hash *partial_sort_table = NULL; /* only read by bench_partial_sort() */
+
+/* Number of flows per table, and how many of the largest (top-N) to extract */
+#define NUM_FLOWS 100
+#define TOP_N 10
+
+/* HASH_SRT comparator: ranks entries by `bytes`, largest first */
+static int bytes_cmp(struct sort_flow_hash *a, struct sort_flow_hash *b)
+{
+	/* -1 when a carries more bytes (sorts first), 1 when fewer, 0 on a tie */
+	return (a->bytes > b->bytes) ? -1 : (a->bytes < b->bytes);
+}
+
+/* Add NUM_FLOWS flows to *table; byte counts come from rand() seeded with `seed` (same seed => same data). Exits if calloc() fails. */
+static void populate_sort_table(struct sort_flow_hash **table, unsigned int seed)
+{
+	srand(seed);
+	for (int i = 0; i < NUM_FLOWS; i++) {
+		struct sort_flow_hash *entry = calloc(1, sizeof(*entry)); /* zeroed, so any padding in the key hashes consistently */
+		if (!entry) { perror("calloc"); exit(EXIT_FAILURE); } /* fail loudly instead of dereferencing NULL */
+		entry->flow.ethertype = 0x0800;
+		entry->flow.src_ip.s_addr = htonl(0x0a000001 + i); /* distinct source per flow */
+		entry->flow.dst_ip.s_addr = htonl(0x0a000100);
+		entry->flow.sport = 1024 + i;
+		entry->flow.dport = 80;
+		entry->bytes = rand() % 1000000;
+		entry->packets = entry->bytes / 100;
+		HASH_ADD(hh, *table, flow, sizeof(struct flow), entry);
+	}
+}
+/* Release every entry of *table; on return *table is NULL (an empty hash) */
+static void free_sort_table(struct sort_flow_hash **table)
+{
+	/* Pop from the head until the hash is empty; HASH_DEL advances *table itself */
+	while (*table != NULL) {
+		struct sort_flow_hash *victim = *table;
+		HASH_DEL(*table, victim);
+		free(victim);
+	}
+}
+
+/*
+ * Partial sort: place the entries of `table` with the largest byte counts
+ * into top[0..], largest first. Costs at most O(flows * n) comparisons.
+ * table: uthash head to scan; it is only read. NULL means "no flows".
+ * top:   caller-owned array with room for n pointers.
+ * n:     capacity of `top`; a NULL `top` or n <= 0 makes the call a no-op.
+ * Only the first min(flows, n) slots are written; the rest are left as-is.
+ */
+static void find_top_n(struct sort_flow_hash *table,
+                       struct sort_flow_hash **top,
+                       int n)
+{
+	int count = 0;
+	struct sort_flow_hash *entry, *tmp;
+
+	if (top == NULL || n <= 0) {
+		return; /* top[n-1] below would index out of bounds */
+	}
+
+	HASH_ITER(hh, table, entry, tmp) {
+		int i;
+		if (count < n) {
+			i = count++; /* still filling: append at the end */
+		} else if (entry->bytes > top[n-1]->bytes) {
+			i = n - 1; /* beats the current smallest: take its slot */
+		} else {
+			continue; /* not among the top n */
+		}
+		/* Insertion step: shift smaller entries down, stop at the first one that is not smaller */
+		for (; i > 0 && entry->bytes > top[i-1]->bytes; i--) {
+			top[i] = top[i-1];
+		}
+		top[i] = entry;
+	}
+}
+
+/* Benchmark context: the output array that find_top_n() fills for bench_partial_sort() */
+struct sort_bench_ctx {
+	struct sort_flow_hash *top[TOP_N]; /* largest first; pointers into the hash table, not copies */
+};
+
+static struct sort_bench_ctx ctx; /* the instance main() passes to bench_run() for the partial-sort run */
+
+/* Benchmark 1: HASH_SRT (current behavior) - sort the whole table, then read the top N */
+static void bench_hash_srt(void *arg)
+{
+	struct sort_flow_hash *cur;
+	int seen = 0;
+
+	(void)arg;
+	HASH_SRT(hh, hash_srt_table, bytes_cmp); /* NOTE(review): the table is never refilled, so every call after the first sorts already-ordered data */
+	/* Consume the first TOP_N entries of the sorted list (what the real code does) */
+	for (cur = hash_srt_table; cur && seen < TOP_N; cur = cur->hh.next, seen++) {
+		BENCH_DONT_OPTIMIZE(cur->bytes);
+	}
+}
+
+/* Benchmark 2: Partial sort / selection; `arg` is the struct sort_bench_ctx whose `top` array gets filled */
+static void bench_partial_sort(void *arg)
+{
+	struct sort_bench_ctx *c = arg;
+	find_top_n(partial_sort_table, c->top, TOP_N);
+	/* With fewer than TOP_N flows the tail slots stay NULL in a zeroed context (such as the static ctx): stop there */
+	for (int i = 0; i < TOP_N && c->top[i] != NULL; i++) {
+		BENCH_DONT_OPTIMIZE(c->top[i]->bytes);
+	}
+}
+
+/* Entry point: build two identically seeded tables, time both top-N strategies, print a comparison. */
+int main(void)
+{
+	const uint64_t iterations = 1000;
+	struct bench_result full_res, partial_res;
+
+	printf("\n=== Flow Sorting Benchmark ===\n");
+	printf("Total flows: %d, extracting top %d\n", NUM_FLOWS, TOP_N);
+
+	/* Same seed for both, so each approach ranks the same byte counts */
+	populate_sort_table(&hash_srt_table, 42);
+	populate_sort_table(&partial_sort_table, 42);
+
+	bench_report_header();
+	bench_run("HASH_SRT full sort (current)", bench_hash_srt, NULL,
+	          iterations, &full_res);
+	bench_report(&full_res);
+	bench_run("partial sort top-N (proposed)", bench_partial_sort, &ctx,
+	          iterations, &partial_res);
+	bench_report(&partial_res);
+
+	/* Derived figures. Note: At 1ms tick rate, sorting happens 1000x/sec; ns/op * 1000 / 1000 gives us/sec */
+	const double saved_pct =
+		100.0 * (full_res.cycles_per_op - partial_res.cycles_per_op) / full_res.cycles_per_op;
+	const double full_us = full_res.ns_per_op * 1000 / 1000.0;
+	const double partial_us = partial_res.ns_per_op * 1000 / 1000.0;
+
+	printf("\n--- Analysis ---\n");
+	printf("HASH_SRT cost:    %.1f cycles (%.1f ns) per sort\n",
+	       full_res.cycles_per_op, full_res.ns_per_op);
+	printf("Partial sort:     %.1f cycles (%.1f ns) per sort\n",
+	       partial_res.cycles_per_op, partial_res.ns_per_op);
+	printf("Improvement:      %.1f%% fewer cycles\n", saved_pct);
+	printf("\nAt 1000 sorts/sec:\n");
+	printf("  HASH_SRT:     %.0f us/sec overhead\n", full_us);
+	printf("  Partial sort: %.0f us/sec overhead\n", partial_us);
+
+	/* Cleanup */
+	free_sort_table(&hash_srt_table);
+	free_sort_table(&partial_sort_table);
+
+	return 0;
+}
diff --git a/deps/toptalk/intervals.c b/deps/toptalk/intervals.c
index a2d33db..3182417 100644
--- a/deps/toptalk/intervals.c
+++ b/deps/toptalk/intervals.c
@@ -620,6 +620,70 @@ static void update_stats_tables(struct flow_pkt *pkt)
 	}
 }
 
+/*
+ * Benchmark hooks: expose internals for performance testing.
+ */
+
+/* Initialize just the data structures needed for benchmarking (no pcap).
+ * May be called again without tt_bench_cleanup(): an existing ring buffer and pools are
+ * reused and flows still in flow_ref_table go back to ref_flow_pool. Returns 0 on success, 1 on failure. */
+int tt_bench_init(void)
+{
+	struct flow_hash *iter, *tmp;
+	ref_window_size = (struct timeval){.tv_sec = 3, .tv_usec = 0 };
+	/* Drain instead of just dropping the head: that would strand the entries in the pool */
+	HASH_ITER(r_hh, flow_ref_table, iter, tmp) {
+		HASH_DELETE(r_hh, flow_ref_table, iter);
+		flow_pool_free(&ref_flow_pool, iter);
+	}
+	flow_ref_table = NULL;
+
+	/* Ring buffer: rewind an existing one, otherwise allocate it */
+	if (pkt_ring.entries != NULL) {
+		pkt_ring.head = 0;
+		pkt_ring.tail = 0;
+	} else if (ring_buffer_alloc(TT_DEFAULT_RING_SIZE) != 0) {
+		return 1;
+	}
+	last_pkt_time = (struct timeval){ 0 };
+
+	/* Flow pools: create whichever is missing; on failure tear down what is set up so far */
+	if (ref_flow_pool.entries == NULL && flow_pool_init(&ref_flow_pool, FLOW_POOL_SIZE) != 0) {
+		ring_buffer_free();
+		return 1;
+	}
+	if (interval_flow_pool.entries == NULL && flow_pool_init(&interval_flow_pool, FLOW_POOL_SIZE * 2) != 0) {
+		flow_pool_cleanup(&ref_flow_pool);
+		ring_buffer_free();
+		return 1;
+	}
+	totals.bytes = 0;
+	totals.packets = 0;
+	return 0;
+}
+
+/* Release what tt_bench_init() set up: reference flows, ring buffer, both flow pools */
+void tt_bench_cleanup(void)
+{
+	/* Pop reference flows from the head; HASH_DELETE advances flow_ref_table, ending at NULL */
+	while (flow_ref_table != NULL) {
+		struct flow_hash *victim = flow_ref_table;
+		HASH_DELETE(r_hh, flow_ref_table, victim);
+		flow_pool_free(&ref_flow_pool, victim);
+	}
+
+	/* Backing storage goes last, after the entries were handed back */
+	ring_buffer_free();
+	flow_pool_cleanup(&ref_flow_pool);
+	flow_pool_cleanup(&interval_flow_pool);
+}
+
+/* Process a packet through the stats update path: public wrapper that forwards `pkt` unchanged to the static update_stats_tables() */
+void tt_bench_update_stats(struct flow_pkt *pkt)
+{
+	update_stats_tables(pkt);
+}
+
 #define DEBUG 1
 #if DEBUG
 static void dbg_per_second(struct tt_top_flows *t5)
diff --git a/deps/toptalk/intervals.h b/deps/toptalk/intervals.h
index bdb1c31..a433f03 100644
--- a/deps/toptalk/intervals.h
+++ b/deps/toptalk/intervals.h
@@ -68,4 +68,19 @@ void tt_set_rtp_forward_callback(tt_rtp_forward_cb cb);
                 exit(EXIT_FAILURE);                                            \
         } while (0)
 
+/*
+ * Benchmark hooks for performance testing.
+ * These expose internal functions without pcap/thread overhead.
+ */
+
+/* Initialize data structures for benchmarking (no pcap needed). Returns 0 on success, 1 on failure. */
+int tt_bench_init(void);
+
+/* Cleanup after benchmarking: empties the reference flow table, frees the ring buffer and both flow pools */
+void tt_bench_cleanup(void);
+
+/* Process a decoded packet through the stats update path.
+ * This is the hot path that runs for every captured packet. */
+void tt_bench_update_stats(struct flow_pkt *pkt);
+
 #endif