Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions benchmarks/pandas/bench_cat_accessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Benchmark the Series.cat accessor: categories, codes, add/remove categories."""
import json
import time
import pandas as pd

N = 50_000
CATS = ["alpha", "beta", "gamma", "delta", "epsilon"]

# Repeat the five labels cyclically for N rows.
data = []
for i in range(N):
    data.append(CATS[i % len(CATS)])
s = pd.Categorical(data, categories=CATS)
series = pd.Series(s)


def _exercise(sr):
    # One round of accessor operations; the results are discarded on purpose,
    # only the cost of computing them is of interest.
    sr.cat.categories
    sr.cat.codes
    sr.cat.add_categories(["zeta"])
    sr.cat.remove_unused_categories()


# Warm-up so lazy imports and caches do not skew the timed loop.
for _ in range(10):
    _exercise(series)

iterations = 100
start = time.perf_counter()
for _ in range(iterations):
    _exercise(series)
total_ms = (time.perf_counter() - start) * 1000

print(json.dumps({
    "function": "cat_accessor",
    "mean_ms": total_ms / iterations,
    "iterations": iterations,
    "total_ms": total_ms,
}))
23 changes: 23 additions & 0 deletions benchmarks/pandas/bench_hash_array.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pandas as pd
import time
import json

N = 100_000
arr = [None if i % 10 == 0 else f"str_{i}" if i % 3 == 0 else i for i in range(N)]

# Warm-up
for _ in range(5):
pd.util.hash_array(arr)

ITERS = 20
start = time.perf_counter()
for _ in range(ITERS):
pd.util.hash_array(arr)
total = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "hash_array",
"mean_ms": total / ITERS,
"iterations": ITERS,
"total_ms": total,
}))
40 changes: 40 additions & 0 deletions benchmarks/pandas/bench_hash_biject_array.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import json
import time

N = 50_000


def _make_value(i):
    # Even indices get string labels, odd indices get small ints;
    # each kind is drawn from a pool of 1000 distinct values.
    bucket = i % 1000
    return f"label_{bucket}" if i % 2 == 0 else bucket


data = [_make_value(i) for i in range(N)]

def hash_biject_array(arr):
    """Assign a stable integer code to each unique value.

    Values are keyed internally by ``(type name, value)`` so that values
    which compare equal but have different types (e.g. ``1`` and ``True``)
    still receive distinct codes.

    Returns:
        codes: list[int] -- one code per element of ``arr``, assigned in
            first-appearance order starting at 0.
        inverse: dict[int, object] -- maps each code back to the ORIGINAL
            value (not the internal key tuple), so decoding the codes with
            this map round-trips to ``arr``.
    """
    mapping = {}
    inverse = {}
    codes = []
    next_code = 0
    for v in arr:
        k = (type(v).__name__, v)
        if k not in mapping:
            mapping[k] = next_code
            # Store the original value, so the inverse map actually inverts
            # the encoding instead of exposing the internal key tuples.
            inverse[next_code] = v
            next_code += 1
        codes.append(mapping[k])
    return codes, inverse

def hash_biject_inverse(codes, inverse_map):
    """Decode integer codes back to values via ``inverse_map`` lookups."""
    return list(map(inverse_map.__getitem__, codes))

# Warm-up: encode and decode the full dataset so allocator and dict caches
# are hot before timing starts.
for _ in range(10):
    warm_codes, warm_inv = hash_biject_array(data)
    hash_biject_inverse(warm_codes, warm_inv)

iterations = 50
start = time.perf_counter()
for _ in range(iterations):
    encoded, decoder = hash_biject_array(data)
    hash_biject_inverse(encoded, decoder)
total_ms = (time.perf_counter() - start) * 1000

result = {
    "function": "hash_biject_array",
    "mean_ms": total_ms / iterations,
    "iterations": iterations,
    "total_ms": total_ms,
}
print(json.dumps(result))
30 changes: 30 additions & 0 deletions benchmarks/pandas/bench_indexers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Benchmark: FixedForwardWindowIndexer and VariableOffsetWindowIndexer on 100k rows"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

offsets = np.array([3 + (i % 5) for i in range(ROWS)], dtype=np.int32)

fixed_idx = pd.api.indexers.FixedForwardWindowIndexer(window_size=10)
var_idx = pd.api.indexers.VariableOffsetWindowIndexer(index=pd.date_range("2020", periods=ROWS, freq="D"), offset=pd.offsets.Day(3))

# Warm up with just fixed (VariableOffsetWindowIndexer needs DatetimeIndex in pandas)
for _ in range(WARMUP):
fixed_idx.get_window_bounds(ROWS, min_periods=1, center=False, closed=None)

start = time.perf_counter()
for _ in range(ITERATIONS):
fixed_idx.get_window_bounds(ROWS, min_periods=1, center=False, closed=None)
var_idx.get_window_bounds(ROWS, min_periods=1, center=False, closed=None)
total = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "indexers",
"mean_ms": total / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total,
}))
45 changes: 45 additions & 0 deletions benchmarks/pandas/bench_scalar_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Benchmark: scalar extraction utilities (squeeze, first_valid_index, last_valid_index) on 100k rows"""
import json, time
import numpy as np
import pandas as pd

ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

# Series with some leading/trailing nulls
data = [None if (i < 100 or i >= ROWS - 100) else i * 0.1 for i in range(ROWS)]
s = pd.Series(data)
s1 = pd.Series([42])

# DataFrame with some nulls
col_a = [None if i < 50 else i * 1.0 for i in range(ROWS)]
col_b = [None if i >= ROWS - 50 else i * 2.0 for i in range(ROWS)]
df = pd.DataFrame({"A": col_a, "B": col_b})
df1col = pd.DataFrame({"A": col_a})

# Warm up
for _ in range(WARMUP):
s.first_valid_index()
s.last_valid_index()
df.first_valid_index()
df.last_valid_index()
s1.squeeze()
df1col.squeeze(axis=1)

start = time.perf_counter()
for _ in range(ITERATIONS):
s.first_valid_index()
s.last_valid_index()
df.first_valid_index()
df.last_valid_index()
s1.squeeze()
df1col.squeeze(axis=1)
total = (time.perf_counter() - start) * 1000

print(json.dumps({
"function": "scalar_extract",
"mean_ms": total / ITERATIONS,
"iterations": ITERATIONS,
"total_ms": total,
}))
30 changes: 30 additions & 0 deletions benchmarks/pandas/bench_swaplevel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Benchmark Series.swaplevel / reorder_levels on a 3-level MultiIndex."""
import pandas as pd
import numpy as np
import time
import json

N = 50_000
# Three index levels with different cardinalities (100 / 500 / 10).
lev_a = [f"a{i % 100}" for i in range(N)]
lev_b = [i % 500 for i in range(N)]
lev_c = [i % 10 for i in range(N)]
idx = pd.MultiIndex.from_arrays([lev_a, lev_b, lev_c])
s = pd.Series(range(N), index=idx)


def _reshuffle(sr):
    # Swap the outer two levels, then rotate all three levels.
    sr.swaplevel(0, 1)
    sr.reorder_levels([2, 0, 1])


# Warm-up
for _ in range(3):
    _reshuffle(s)

ITERS = 20
start = time.perf_counter()
for _ in range(ITERS):
    _reshuffle(s)
total = (time.perf_counter() - start) * 1000

payload = {
    "function": "swaplevel",
    "mean_ms": total / ITERS,
    "iterations": ITERS,
    "total_ms": total,
}
print(json.dumps(payload))
34 changes: 34 additions & 0 deletions benchmarks/tsb/bench_cat_accessor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import { Series, CategoricalAccessor } from "../../src/index.js";

const N = 50_000;
const CATS = ["alpha", "beta", "gamma", "delta", "epsilon"] as const;

// Cycle through the five category labels for N rows.
const data: string[] = [];
for (let i = 0; i < N; i++) data.push(CATS[i % CATS.length]);

const s = new Series({ data });
const acc = new CategoricalAccessor(s);

/** One round of accessor operations; results are intentionally discarded. */
function exercise(): void {
  acc.categories;
  acc.codes;
  acc.addCategories(["zeta"]);
  acc.removeUnusedCategories();
}

// Warm-up
for (let i = 0; i < 10; i++) exercise();

const iterations = 100;
const start = performance.now();
for (let i = 0; i < iterations; i++) exercise();
const total_ms = performance.now() - start;

console.log(
  JSON.stringify({
    function: "cat_accessor",
    mean_ms: total_ms / iterations,
    iterations,
    total_ms,
  }),
);
23 changes: 23 additions & 0 deletions benchmarks/tsb/bench_hash_array.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import { hashArray } from "../../src/index.js";

const N = 100_000;

// Mixed input: every 10th entry null, remaining multiples of 3 strings, the rest numbers.
const arr: (string | number | null)[] = [];
for (let i = 0; i < N; i++) {
  if (i % 10 === 0) arr.push(null);
  else if (i % 3 === 0) arr.push(`str_${i}`);
  else arr.push(i);
}

// Warm-up
for (let i = 0; i < 5; i++) {
  hashArray(arr);
}

const ITERS = 20;
const start = performance.now();
for (let i = 0; i < ITERS; i++) {
  hashArray(arr);
}
const total = performance.now() - start;

console.log(
  JSON.stringify({
    function: "hash_array",
    mean_ms: total / ITERS,
    iterations: ITERS,
    total_ms: total,
  }),
);
29 changes: 29 additions & 0 deletions benchmarks/tsb/bench_hash_biject_array.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import { hashBijectArray, hashBijectInverse } from "../../src/index.js";

const N = 50_000;

// Even indices get string labels, odd indices small numbers (1000 distinct values each).
const data: (string | number)[] = [];
for (let i = 0; i < N; i++) {
  const bucket = i % 1000;
  data.push(i % 2 === 0 ? `label_${bucket}` : bucket);
}

/** Encode the dataset and immediately decode it again. */
const roundTrip = (): void => {
  hashBijectInverse(hashBijectArray(data));
};

// Warm-up
for (let i = 0; i < 10; i++) roundTrip();

const iterations = 50;
const start = performance.now();
for (let i = 0; i < iterations; i++) roundTrip();
const total_ms = performance.now() - start;

console.log(
  JSON.stringify({
    function: "hash_biject_array",
    mean_ms: total_ms / iterations,
    iterations,
    total_ms,
  }),
);
35 changes: 35 additions & 0 deletions benchmarks/tsb/bench_indexers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/**
 * Benchmark: FixedForwardWindowIndexer and VariableOffsetWindowIndexer on 100k rows
 */
import { FixedForwardWindowIndexer, VariableOffsetWindowIndexer } from "../../src/index.js";

const ROWS = 100_000;
const WARMUP = 3;
const ITERATIONS = 10;

// Per-row window offsets cycling through 3..7.
const offsets = new Int32Array(ROWS);
for (let i = 0; i < ROWS; i++) offsets[i] = 3 + (i % 5);

const fixedIdx = new FixedForwardWindowIndexer({ windowSize: 10 });
const varIdx = new VariableOffsetWindowIndexer({ indexArray: offsets });

/** Compute window bounds with both indexers once. */
function computeBounds(): void {
  fixedIdx.getWindowBounds(ROWS);
  varIdx.getWindowBounds(ROWS);
}

// Warm up
for (let i = 0; i < WARMUP; i++) computeBounds();

const start = performance.now();
for (let i = 0; i < ITERATIONS; i++) computeBounds();
const total = performance.now() - start;

console.log(
  JSON.stringify({
    function: "indexers",
    mean_ms: total / ITERATIONS,
    iterations: ITERATIONS,
    total_ms: total,
  }),
);
53 changes: 53 additions & 0 deletions benchmarks/tsb/bench_scalar_extract.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/**
 * Benchmark: scalar extraction utilities (squeeze, firstValidIndex, lastValidIndex) on 100k rows
 */
import { DataFrame, Index, Series, squeezeSeries, squeezeDataFrame, firstValidIndex, lastValidIndex, dataFrameFirstValidIndex, dataFrameLastValidIndex } from "../../src/index.js";

const ROWS = 100_000;
const WARMUP = 3;
const ITERATIONS = 10;

// Series whose first and last 100 entries are null.
const data: (number | null)[] = [];
for (let i = 0; i < ROWS; i++) {
  data.push(i < 100 || i >= ROWS - 100 ? null : i * 0.1);
}
const s = new Series(data);
const s1 = new Series([42]); // single element: squeeze yields the scalar

// DataFrame columns with nulls at opposite ends.
const colA: (number | null)[] = [];
const colB: (number | null)[] = [];
for (let i = 0; i < ROWS; i++) {
  colA.push(i < 50 ? null : i * 1.0);
  colB.push(i >= ROWS - 50 ? null : i * 2.0);
}
const df = new DataFrame({ A: colA, B: colB });
const df1col = new DataFrame({ A: colA }); // single column: squeeze(axis=1) yields a Series

/** One pass over every operation under measurement; results are discarded. */
function exercise(): void {
  firstValidIndex(s);
  lastValidIndex(s);
  dataFrameFirstValidIndex(df);
  dataFrameLastValidIndex(df);
  squeezeSeries(s1);
  squeezeDataFrame(df1col, 1);
}

// Warm up
for (let i = 0; i < WARMUP; i++) exercise();

const start = performance.now();
for (let i = 0; i < ITERATIONS; i++) exercise();
const total = performance.now() - start;

console.log(
  JSON.stringify({
    function: "scalar_extract",
    mean_ms: total / ITERATIONS,
    iterations: ITERATIONS,
    total_ms: total,
  }),
);
Loading
Loading