diff --git a/benchmarks/pandas/bench_cat_accessor.py b/benchmarks/pandas/bench_cat_accessor.py new file mode 100644 index 00000000..ef766cc1 --- /dev/null +++ b/benchmarks/pandas/bench_cat_accessor.py @@ -0,0 +1,32 @@ +import json +import time +import pandas as pd + +N = 50_000 +CATS = ["alpha", "beta", "gamma", "delta", "epsilon"] +data = [CATS[i % len(CATS)] for i in range(N)] +s = pd.Categorical(data, categories=CATS) +series = pd.Series(s) + +# Warm-up +for _ in range(10): + _ = series.cat.categories + _ = series.cat.codes + series.cat.add_categories(["zeta"]) + series.cat.remove_unused_categories() + +iterations = 100 +start = time.perf_counter() +for _ in range(iterations): + _ = series.cat.categories + _ = series.cat.codes + series.cat.add_categories(["zeta"]) + series.cat.remove_unused_categories() +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "cat_accessor", + "mean_ms": total_ms / iterations, + "iterations": iterations, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_hash_array.py b/benchmarks/pandas/bench_hash_array.py new file mode 100644 index 00000000..11da4b97 --- /dev/null +++ b/benchmarks/pandas/bench_hash_array.py @@ -0,0 +1,23 @@ +import pandas as pd +import time +import json + +N = 100_000 +arr = pd.array([None if i % 10 == 0 else f"str_{i}" if i % 3 == 0 else i for i in range(N)], dtype=object) + +# Warm-up +for _ in range(5): + pd.util.hash_array(arr) + +ITERS = 20 +start = time.perf_counter() +for _ in range(ITERS): + pd.util.hash_array(arr) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "hash_array", + "mean_ms": total / ITERS, + "iterations": ITERS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_hash_biject_array.py b/benchmarks/pandas/bench_hash_biject_array.py new file mode 100644 index 00000000..78d83bde --- /dev/null +++ b/benchmarks/pandas/bench_hash_biject_array.py @@ -0,0 +1,40 @@ +import json +import time + +N = 50_000 +data = [f"label_{i % 1000}" if 
i % 2 == 0 else i % 1000 for i in range(N)] + +def hash_biject_array(arr): + """Assign a stable integer code to each unique value.""" + mapping = {} + codes = [] + next_code = 0 + for v in arr: + k = (type(v).__name__, v) + if k not in mapping: + mapping[k] = next_code + next_code += 1 + codes.append(mapping[k]) + return codes, {code: key[1] for key, code in mapping.items()} + +def hash_biject_inverse(codes, inverse_map): + return [inverse_map[c] for c in codes] + +# Warm-up +for _ in range(10): + codes, inv = hash_biject_array(data) + hash_biject_inverse(codes, inv) + +iterations = 50 +start = time.perf_counter() +for _ in range(iterations): + codes, inv = hash_biject_array(data) + hash_biject_inverse(codes, inv) +total_ms = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "hash_biject_array", + "mean_ms": total_ms / iterations, + "iterations": iterations, + "total_ms": total_ms, +})) diff --git a/benchmarks/pandas/bench_indexers.py b/benchmarks/pandas/bench_indexers.py new file mode 100644 index 00000000..4f35085b --- /dev/null +++ b/benchmarks/pandas/bench_indexers.py @@ -0,0 +1,31 @@ +"""Benchmark: FixedForwardWindowIndexer and VariableOffsetWindowIndexer on 100k rows""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +offsets = np.array([3 + (i % 5) for i in range(ROWS)], dtype=np.int32) + +fixed_idx = pd.api.indexers.FixedForwardWindowIndexer(window_size=10) +var_idx = pd.api.indexers.VariableOffsetWindowIndexer(index=pd.date_range("2020", periods=ROWS, freq="D"), offset=pd.offsets.Day(3)) + +# Warm up both indexers so first-call setup cost is not measured in the timed loop +for _ in range(WARMUP): + fixed_idx.get_window_bounds(ROWS, min_periods=1, center=False, closed=None) + var_idx.get_window_bounds(ROWS, min_periods=1, center=False, closed=None) + +start = time.perf_counter() +for _ in range(ITERATIONS): + fixed_idx.get_window_bounds(ROWS, min_periods=1, center=False, closed=None) + var_idx.get_window_bounds(ROWS, min_periods=1, center=False, closed=None) 
+total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "indexers", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_scalar_extract.py b/benchmarks/pandas/bench_scalar_extract.py new file mode 100644 index 00000000..0be1bc98 --- /dev/null +++ b/benchmarks/pandas/bench_scalar_extract.py @@ -0,0 +1,45 @@ +"""Benchmark: scalar extraction utilities (squeeze, first_valid_index, last_valid_index) on 100k rows""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +# Series with some leading/trailing nulls +data = [None if (i < 100 or i >= ROWS - 100) else i * 0.1 for i in range(ROWS)] +s = pd.Series(data) +s1 = pd.Series([42]) + +# DataFrame with some nulls +col_a = [None if i < 50 else i * 1.0 for i in range(ROWS)] +col_b = [None if i >= ROWS - 50 else i * 2.0 for i in range(ROWS)] +df = pd.DataFrame({"A": col_a, "B": col_b}) +df1col = pd.DataFrame({"A": col_a}) + +# Warm up +for _ in range(WARMUP): + s.first_valid_index() + s.last_valid_index() + df.first_valid_index() + df.last_valid_index() + s1.squeeze() + df1col.squeeze(axis=1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.first_valid_index() + s.last_valid_index() + df.first_valid_index() + df.last_valid_index() + s1.squeeze() + df1col.squeeze(axis=1) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "scalar_extract", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_swaplevel.py b/benchmarks/pandas/bench_swaplevel.py new file mode 100644 index 00000000..fe8737b1 --- /dev/null +++ b/benchmarks/pandas/bench_swaplevel.py @@ -0,0 +1,30 @@ +import pandas as pd +import numpy as np +import time +import json + +N = 50_000 +lev_a = [f"a{i % 100}" for i in range(N)] +lev_b = [i % 500 for i in range(N)] +lev_c = [i % 10 for i in range(N)] +idx = 
pd.MultiIndex.from_arrays([lev_a, lev_b, lev_c]) +s = pd.Series(range(N), index=idx) + +# Warm-up +for _ in range(3): + s.swaplevel(0, 1) + s.reorder_levels([2, 0, 1]) + +ITERS = 20 +start = time.perf_counter() +for _ in range(ITERS): + s.swaplevel(0, 1) + s.reorder_levels([2, 0, 1]) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "swaplevel", + "mean_ms": total / ITERS, + "iterations": ITERS, + "total_ms": total, +})) diff --git a/benchmarks/tsb/bench_cat_accessor.ts b/benchmarks/tsb/bench_cat_accessor.ts new file mode 100644 index 00000000..cc3c08fb --- /dev/null +++ b/benchmarks/tsb/bench_cat_accessor.ts @@ -0,0 +1,34 @@ +import { Series, CategoricalAccessor } from "../../src/index.js"; + +const N = 50_000; +const CATS = ["alpha", "beta", "gamma", "delta", "epsilon"] as const; +const data: string[] = Array.from({ length: N }, (_, i) => CATS[i % CATS.length]); +const s = new Series({ data }); +const acc = new CategoricalAccessor(s); + +// Warm-up +for (let i = 0; i < 10; i++) { + acc.categories; + acc.codes; + acc.addCategories(["zeta"]); + acc.removeUnusedCategories(); +} + +const iterations = 100; +const start = performance.now(); +for (let i = 0; i < iterations; i++) { + acc.categories; + acc.codes; + acc.addCategories(["zeta"]); + acc.removeUnusedCategories(); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "cat_accessor", + mean_ms: total_ms / iterations, + iterations, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_hash_array.ts b/benchmarks/tsb/bench_hash_array.ts new file mode 100644 index 00000000..24ed81d3 --- /dev/null +++ b/benchmarks/tsb/bench_hash_array.ts @@ -0,0 +1,23 @@ +import { hashArray } from "../../src/index.js"; + +const N = 100_000; +const arr: (string | number | null)[] = Array.from({ length: N }, (_, i) => + i % 10 === 0 ? null : i % 3 === 0 ? 
`str_${i}` : i, +); + +// Warm-up +for (let i = 0; i < 5; i++) hashArray(arr); + +const ITERS = 20; +const start = performance.now(); +for (let i = 0; i < ITERS; i++) hashArray(arr); +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "hash_array", + mean_ms: total / ITERS, + iterations: ITERS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_hash_biject_array.ts b/benchmarks/tsb/bench_hash_biject_array.ts new file mode 100644 index 00000000..d0be9916 --- /dev/null +++ b/benchmarks/tsb/bench_hash_biject_array.ts @@ -0,0 +1,29 @@ +import { hashBijectArray, hashBijectInverse } from "../../src/index.js"; + +const N = 50_000; +const data: (string | number)[] = Array.from({ length: N }, (_, i) => + i % 2 === 0 ? `label_${i % 1000}` : i % 1000, +); + +// Warm-up +for (let i = 0; i < 10; i++) { + const codes = hashBijectArray(data); + hashBijectInverse(codes); +} + +const iterations = 50; +const start = performance.now(); +for (let i = 0; i < iterations; i++) { + const codes = hashBijectArray(data); + hashBijectInverse(codes); +} +const total_ms = performance.now() - start; + +console.log( + JSON.stringify({ + function: "hash_biject_array", + mean_ms: total_ms / iterations, + iterations, + total_ms, + }), +); diff --git a/benchmarks/tsb/bench_indexers.ts b/benchmarks/tsb/bench_indexers.ts new file mode 100644 index 00000000..a62fe5ed --- /dev/null +++ b/benchmarks/tsb/bench_indexers.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: FixedForwardWindowIndexer and VariableOffsetWindowIndexer on 100k rows + */ +import { FixedForwardWindowIndexer, VariableOffsetWindowIndexer } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const offsets = Int32Array.from({ length: ROWS }, (_, i) => 3 + (i % 5)); + +const fixedIdx = new FixedForwardWindowIndexer({ windowSize: 10 }); +const varIdx = new VariableOffsetWindowIndexer({ indexArray: offsets }); + +// Warm up +for (let i = 0; i < WARMUP; i++) { 
+ fixedIdx.getWindowBounds(ROWS); + varIdx.getWindowBounds(ROWS); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + fixedIdx.getWindowBounds(ROWS); + varIdx.getWindowBounds(ROWS); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "indexers", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_scalar_extract.ts b/benchmarks/tsb/bench_scalar_extract.ts new file mode 100644 index 00000000..32cfbd93 --- /dev/null +++ b/benchmarks/tsb/bench_scalar_extract.ts @@ -0,0 +1,53 @@ +/** + * Benchmark: scalar extraction utilities (squeeze, firstValidIndex, lastValidIndex) on 100k rows + */ +import { DataFrame, Index, Series, squeezeSeries, squeezeDataFrame, firstValidIndex, lastValidIndex, dataFrameFirstValidIndex, dataFrameLastValidIndex } from "../../src/index.js"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +// Series with some leading/trailing nulls +const data: (number | null)[] = Array.from({ length: ROWS }, (_, i) => + i < 100 || i >= ROWS - 100 ? null : i * 0.1, +); +const s = new Series(data); +const s1 = new Series([42]); + +// DataFrame with some nulls +const colA: (number | null)[] = Array.from({ length: ROWS }, (_, i) => (i < 50 ? null : i * 1.0)); +const colB: (number | null)[] = Array.from({ length: ROWS }, (_, i) => + i >= ROWS - 50 ? 
null : i * 2.0, +); +const df = new DataFrame({ A: colA, B: colB }); +const df1col = new DataFrame({ A: colA }); + +// Warm up +for (let i = 0; i < WARMUP; i++) { + firstValidIndex(s); + lastValidIndex(s); + dataFrameFirstValidIndex(df); + dataFrameLastValidIndex(df); + squeezeSeries(s1); + squeezeDataFrame(df1col, 1); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + firstValidIndex(s); + lastValidIndex(s); + dataFrameFirstValidIndex(df); + dataFrameLastValidIndex(df); + squeezeSeries(s1); + squeezeDataFrame(df1col, 1); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "scalar_extract", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_swaplevel.ts b/benchmarks/tsb/bench_swaplevel.ts new file mode 100644 index 00000000..627b8fe6 --- /dev/null +++ b/benchmarks/tsb/bench_swaplevel.ts @@ -0,0 +1,32 @@ +import { MultiIndex, Series, swapLevelSeries, reorderLevelsSeries } from "../../src/index.js"; + +const N = 50_000; +const levA = Array.from({ length: N }, (_, i) => `a${i % 100}`); +const levB = Array.from({ length: N }, (_, i) => i % 500); +const levC = Array.from({ length: N }, (_, i) => i % 10); +const tuples: [string, number, number][] = levA.map((v, i) => [v, levB[i], levC[i]]); +const idx = new MultiIndex({ tuples }); +const s = new Series({ data: Array.from({ length: N }, (_, i) => i), index: idx }); + +// Warm-up +for (let i = 0; i < 3; i++) { + swapLevelSeries(s, 0, 1); + reorderLevelsSeries(s, [2, 0, 1]); +} + +const ITERS = 20; +const start = performance.now(); +for (let i = 0; i < ITERS; i++) { + swapLevelSeries(s, 0, 1); + reorderLevelsSeries(s, [2, 0, 1]); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "swaplevel", + mean_ms: total / ITERS, + iterations: ITERS, + total_ms: total, + }), +);