Evaluation metrics for NumPy and PyTorch with a unified API.
Every function accepts both np.ndarray and torch.Tensor (with correct
@overload type narrowing) and returns the matching type.
pip install git+https://github.com/fxeqxmulfx/metrics.git

Requires Python >= 3.10, NumPy >= 2.2 and PyTorch >= 2.11.

| Function | Formula | Range | Params |
|---|---|---|---|
| `norm` | `(sum(\|x\|^p))^(1/p)` | [0, +inf) | p (default 2), axis |
| `l1_norm` | `sum(\|x\|)` | [0, +inf) | axis |
| `l2_norm` | `sqrt(sum(x^2))` | [0, +inf) | axis |
| `linf_norm` | `max(\|x\|)` | [0, +inf) | axis |

import numpy as np
import torch
from metrics import norm, l1_norm, l2_norm, linf_norm
x = np.array([3.0, -4.0])
l1_norm(x) # 7.0
l2_norm(x) # 5.0
linf_norm(x) # 4.0
norm(x, p=3) # (27 + 64)^(1/3)
norm(x, p=0.5) # fractional norm
# Per-row norms on 2-D
x_2d = np.array([[3.0, 0.0], [0.0, 4.0]])
l2_norm(x_2d, axis=1) # [3.0, 4.0]
# PyTorch
xt = torch.tensor([3.0, -4.0])
l2_norm(xt) # tensor(5.0)

| Function | Formula | Range | Params |
|---|---|---|---|
| `mean` | `sum(x) / n` | (-inf, +inf) | axis |
| `geometric_mean` | `exp(mean(log(x)))` | (0, +inf) | axis |
| `harmonic_mean` | `1 / mean(1/x)` | (0, +inf) | axis |
| `power_mean` | `mean(x^p)^(1/p)` | depends on p | p, axis |
| `trimmed_mean` | mean after trimming tails | (-inf, +inf) | proportiontocut, axis |

power_mean unifies all: p=-inf is min, p=-1 harmonic, p=0 geometric,
p=1 arithmetic, p=2 quadratic (RMS), p=+inf is max.
For positive values: min <= harmonic <= geometric <= arithmetic <= RMS <= max.
import numpy as np
import torch
from metrics import (
mean, geometric_mean, harmonic_mean, power_mean, trimmed_mean,
)
x = np.array([1.0, 2.0, 4.0, 8.0])
mean(x) # 3.75 (arithmetic)
geometric_mean(x) # 2.83 (exp(mean(log(x))))
harmonic_mean(x) # 2.13 (1 / mean(1/x))
# Power mean special cases
power_mean(x, p=-1) # harmonic
power_mean(x, p=0) # geometric
power_mean(x, p=1) # arithmetic
power_mean(x, p=2) # quadratic (RMS)
power_mean(x, p=float("inf")) # max = 8.0
power_mean(x, p=float("-inf")) # min = 1.0
# Trimmed mean: robust to outliers
y = np.array([1.0, 100.0, 3.0, 2.0, 200.0])
mean(y) # 61.2 (pulled by outliers)
trimmed_mean(y, 0.2) # 35.0 (trims 20% from each tail)
# 2-D with axis
x_2d = np.array([[1.0, 4.0], [4.0, 16.0]])
geometric_mean(x_2d, axis=0) # [2.0, 8.0]
harmonic_mean(x_2d, axis=0) # [1.6, 6.4]
trimmed_mean(x_2d, 0.0, axis=1) # per-row mean
# PyTorch
xt = torch.tensor([1.0, 2.0, 4.0, 8.0])
geometric_mean(xt) # tensor(2.83...)
power_mean(xt, p=2) # tensor(...)

| Function | Description | Range | Params |
|---|---|---|---|
| `median` | Median value | (-inf, +inf) | axis |
| `std` | Standard deviation (Bessel's correction by default) | [0, +inf) | axis, ddof (default 1) |
| `perc` | Percentile | (-inf, +inf) | q in [0, 100], axis |
| `spearman_corr` | Spearman rank correlation | [-1, 1] | axis |
| `cosine_similarity` | Cosine similarity | [-1, 1] | axis |

import numpy as np
import torch
from metrics import median, std, perc, spearman_corr, cosine_similarity
# --- NumPy ---
x = np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
median(x) # 4.5
std(x) # sample std (ddof=1)
std(x, ddof=0) # population std
perc(x, 25) # 25th percentile
perc(x, 75) # 75th percentile
# 2-D with axis
x_2d = np.array([[1.0, 2.0], [3.0, 4.0]])
median(x_2d, axis=0) # [2.0, 3.0]
std(x_2d, axis=0) # per-column sample std
perc(x_2d, 50, axis=0) # per-column median
# Correlations
a = np.array([1.0, 2.0, 3.0, 4.0])
b = np.array([2.0, 4.0, 6.0, 8.0])
spearman_corr(a, b) # 1.0 (perfect monotone)
# Spearman captures monotone non-linear relationships
c = np.array([1.0, 4.0, 9.0, 16.0])
spearman_corr(a, c) # 1.0 (perfectly monotone)
# Per-column correlation on 2-D
x_2d = np.array([[1.0, 4.0], [2.0, 3.0], [3.0, 2.0], [4.0, 1.0]])
y_2d = np.array([[10.0, 40.0], [20.0, 30.0], [30.0, 20.0], [40.0, 10.0]])
spearman_corr(x_2d, y_2d, axis=0) # [1.0, 1.0]
# --- PyTorch ---
xt = torch.tensor([1.0, 2.0, 3.0, 4.0])
yt = torch.tensor([2.0, 4.0, 6.0, 8.0])
spearman_corr(xt, yt) # tensor(1.0)
# Cosine similarity
a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 4.0, 6.0])
cosine_similarity(a, b) # 1.0 (same direction)
cosine_similarity(a, -a) # -1.0 (opposite)
# Scale-invariant
cosine_similarity(a, a * 100) # 1.0
# Per-row similarity on 2-D (e.g. batch of embeddings)
x_2d = np.array([[1.0, 0.0], [0.0, 1.0]])
y_2d = np.array([[2.0, 0.0], [0.0, -1.0]])
cosine_similarity(x_2d, y_2d, axis=1) # [1.0, -1.0]

All point-wise metrics take (y_true, y_pred) and support an axis parameter.

| Function | Formula | Range | Params |
|---|---|---|---|
| `mae` | `mean(\|y_true - y_pred\|)` | [0, +inf) | axis |
| `mad` | `median(\|y_true - y_pred\|)` | [0, +inf) | axis |
| `mse` | `mean((y_true - y_pred)^2)` | [0, +inf) | axis |
| `rmse` | `sqrt(mse)` | [0, +inf) | axis |
| `mape` | `mean(\|y_true - y_pred\| / (\|y_true\| + eps)) * 100` | [0, +inf) % | eps (default 1e-8), axis |

import numpy as np
import torch
from metrics import mae, mad, mse, rmse, mape
# --- NumPy ---
y_true = np.array([[1.0, 2.0], [3.0, 4.0]])
y_pred = np.array([[1.5, 2.5], [3.5, 4.5]])
mae(y_true, y_pred) # 0.5 (scalar, reduces all axes)
mae(y_true, y_pred, axis=0) # [0.5, 0.5] (per-column)
mae(y_true, y_pred, axis=1) # [0.5, 0.5] (per-row)
mad(y_true, y_pred) # 0.5 (robust to outliers)
mse(y_true, y_pred) # 0.25
rmse(y_true, y_pred) # 0.5
mape(y_true, y_pred) # percentage error
mape(y_true, y_pred, eps=1e-8) # explicit eps (1e-8 is the default); guards against division by zero
# MAD is robust to outliers
y_true_out = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y_pred_out = np.array([1.1, 2.1, 3.1, 4.1, 105.0])
mae(y_true_out, y_pred_out) # 20.08 (inflated by outlier)
mad(y_true_out, y_pred_out) # 0.1 (ignores outlier)
# --- PyTorch (same API, gradients flow through) ---
yt = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
yp = torch.tensor([[1.5, 2.5], [3.5, 4.5]], requires_grad=True)
loss = mse(yt, yp)
loss.backward() # yp.grad is populated

All distribution metrics take two point clouds of shape (N, D) and (M, D).

| Function | Description | Range | Params |
|---|---|---|---|
| `mmd` | Maximum Mean Discrepancy, RBF kernel (Gretton '06) | [0, +inf) | bandwidth (None = median heuristic) |
| `sinkhorn_ot` | Entropic regularized OT cost (Cuturi '13) | [0, +inf) | eps (default 1.0), n_iter (default 100) |
| `sinkhorn_divergence` | Debiased Sinkhorn divergence (Genevay, Peyre, Cuturi '18) | [0, +inf) | eps (default 1.0), n_iter (default 100) |

MMD and Sinkhorn divergence both measure the distance between two distributions. Sinkhorn divergence interpolates between the Wasserstein distance and MMD, getting the best of both:

| | MMD | Sinkhorn divergence |
|---|---|---|
| Speed | O(n^2), single pass | O(n^2) per iteration x n_iter |
| Sample efficiency | O(1/sqrt(n)), dimension-independent | O(1/sqrt(n)) for large eps |
| Geometry | Misses support away from dense areas | Recovers full support (small eps) |
| Bias | Unbiased | Debiased: SD(x, x) = 0 |
| Use case | Quick two-sample test | Training loss for generative models |
Asymptotic behaviour of eps:
- eps -> 0: converges to the Wasserstein distance `W_c` (geometric, but suffers from the curse of dimensionality)
- eps -> +inf: converges to `1/2 * MMD^2_{-c}` (statistically efficient)

Rule of thumb: start with eps=1.0. Increase eps if gradients are noisy.
Decrease eps (and increase n_iter) if the loss doesn't capture fine geometric
structure.
import numpy as np
import torch
from metrics import mmd, sinkhorn_ot, sinkhorn_divergence
# --- MMD: quick comparison ---
x_np = np.random.randn(100, 2)
y_np = np.random.randn(100, 2) + 3.0
mmd(x_np, y_np) # auto bandwidth (median heuristic)
mmd(x_np, y_np, bandwidth=1.0) # fixed bandwidth
# PyTorch
x = torch.randn(100, 2)
y = torch.randn(100, 2) + 3
mmd(x, y) # auto bandwidth (median heuristic)
mmd(x, y, bandwidth=0.5) # fixed bandwidth
# --- Sinkhorn OT ---
sinkhorn_ot(x_np, y_np, eps=1.0) # float
sinkhorn_ot(x, y, eps=1.0) # torch.Tensor (differentiable)
sinkhorn_ot(x, y, eps=0.1, n_iter=200) # smaller eps needs more iterations
# --- Sinkhorn divergence: training loss for generative models ---
real = torch.randn(200, 64)
fake = torch.randn(200, 64, requires_grad=True)
loss = sinkhorn_divergence(real, fake, eps=1.0) # balanced (recommended start)
loss.backward() # gradients flow to fake
# eps controls the interpolation
sinkhorn_divergence(real, fake, eps=0.01, n_iter=500) # ≈ Wasserstein (geometric)
sinkhorn_divergence(real, fake, eps=1.0) # balanced
sinkhorn_divergence(real, fake, eps=100.0) # ≈ MMD (statistically efficient)

| Function | Description | Range | Params |
|---|---|---|---|
| `sign_corr` | Soft sign correlation of n-th finite differences | [-1, 1] | n (default 1), sharpness (default 10.0), axis (default 0, supports None) |

+1 = all derivative signs match, -1 = all opposite, 0 = uncorrelated.
Uses a differentiable sigmoid approximation (2 * sigmoid(sharpness * x) - 1)
so gradients flow through for PyTorch.
sharpness controls the trade-off between differentiability and hard sign:
| sharpness | Behaviour |
|---|---|
| ~1 | Very smooth, gradients are large but sign boundaries are blurry |
| ~10 (default) | Good balance: near-binary output, still useful gradients |
| ~100 | Almost identical to hard sign(), gradients vanish far from zero |
| -> inf | Exact hard sign, non-differentiable (zero gradient everywhere except at 0) |
In practice: use low sharpness (~1-5) when optimizing sign_corr as a loss,
default (~10) for evaluation, and high sharpness (~100+) when you just want
a hard match count.
import numpy as np
import torch
from metrics import sign_corr
# --- NumPy ---
old = np.array([0.0, 1.0, 4.0, 9.0, 16.0])
new = np.array([0.0, 1.1, 3.9, 9.2, 15.8])
sign_corr(old, new) # ~1.0, gradient signs match (n=1 default)
sign_corr(old, new, n=2) # compare curvature signs instead
sign_corr(old, new, sharpness=100.0) # sharper sigmoid, closer to hard sign
# Opposite shapes: every slope (n=1) and the curvature (n=2) have opposite signs
convex = np.array([4.0, 1.0, 0.0, 1.0, 4.0]) # x^2
concave = np.array([-4.0, -1.0, 0.0, -1.0, -4.0]) # -x^2
sign_corr(convex, concave) # ~-1.0
# 2-D: per-column result
old_2d = np.array([[0.0, 0.0], [1.0, 1.0], [4.0, 0.0], [9.0, 1.0]])
new_2d = np.array([[0.0, 0.0], [1.0, 1.0], [4.0, 0.0], [9.0, 1.0]])
sign_corr(old_2d, new_2d) # [1.0, 1.0] (axis=0 default)
sign_corr(old_2d, new_2d, axis=None) # scalar over all elements
# --- PyTorch (differentiable) ---
old_t = torch.tensor([0.0, 1.0, 4.0, 9.0, 16.0])
new_t = torch.tensor([0.0, 1.1, 3.9, 9.2, 15.8], requires_grad=True)
score = sign_corr(old_t, new_t)
score.backward() # new_t.grad is populated

| Function | Description | Range | Params |
|---|---|---|---|
| `fast_dtw` | Fast Dynamic Time Warping (Salvador & Chan '04) | [0, +inf) | radius (default 1) |

Approximates DTW in O(N) instead of O(N^2) by recursively coarsening the time
series and refining the warp path within a constraint radius. Inputs are 1-D
(N,) or 2-D (N, D) time series that may have different lengths.
import numpy as np
import torch
from metrics import fast_dtw
# --- NumPy ---
x = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
y = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
fast_dtw(x, y) # 0.0 (identical)
# Different lengths
x = np.array([0.0, 1.0, 2.0])
y = np.array([0.0, 0.5, 1.0, 1.5, 2.0])
fast_dtw(x, y) # aligns despite length mismatch
# Multivariate time series (N, D)
x_2d = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]])
y_2d = np.array([[0.5, 0.5], [1.5, 1.5], [2.5, 2.5]])
fast_dtw(x_2d, y_2d)
# Larger radius = better approximation, slower
fast_dtw(x, y, radius=1) # default, fastest
fast_dtw(x, y, radius=5) # closer to exact DTW
# --- PyTorch ---
xt = torch.tensor([0.0, 1.0, 2.0, 3.0])
yt = torch.tensor([0.5, 1.5, 2.5, 3.5])
fast_dtw(xt, yt) # tensor(...)

All functions accept np.ndarray, so use .values (or .to_numpy()) to pass
DataFrame columns or entire DataFrames directly.
import numpy as np
import pandas as pd
from metrics import (
mae, mse, rmse, mape, mad,
mean, geometric_mean, harmonic_mean, trimmed_mean, median, std, perc,
l1_norm, l2_norm, linf_norm,
spearman_corr, cosine_similarity, sign_corr,
mmd, sinkhorn_divergence,
)
df = pd.DataFrame({
"actual": [100.0, 200.0, 300.0, 400.0, 500.0],
"predicted": [110.0, 190.0, 310.0, 420.0, 480.0],
})
# ── Point-wise metrics on columns ──
mae(df["actual"].values, df["predicted"].values) # scalar
mse(df["actual"].values, df["predicted"].values)
rmse(df["actual"].values, df["predicted"].values)
mape(df["actual"].values, df["predicted"].values) # in percent
mad(df["actual"].values, df["predicted"].values) # robust to outliers
# ── Summary stats on a single column ──
mean(df["actual"].values)
geometric_mean(df["actual"].values)
harmonic_mean(df["actual"].values)
trimmed_mean(df["actual"].values, 0.1)
median(df["actual"].values)
std(df["actual"].values) # ddof=1 by default
perc(df["actual"].values, 90) # 90th percentile
# ── Norms ──
l2_norm(df["actual"].values)
# ── Column-wise metrics on entire DataFrame ──
vals = df[["actual", "predicted"]].values # shape (5, 2)
mean(vals, axis=0) # per-column mean
std(vals, axis=0) # per-column std
# ── Correlation between columns ──
spearman_corr(df["actual"].values, df["predicted"].values)
# ── Sign correlation (agreement of first-difference signs, i.e. trend direction; use n=2 for curvature) ──
sign_corr(df["actual"].values, df["predicted"].values)
# ── Cosine similarity between two feature vectors ──
cosine_similarity(df["actual"].values, df["predicted"].values)
# ── Distribution metrics (need 2-D point clouds) ──
# e.g. comparing two groups of multi-dimensional samples
group_a = df[["actual", "predicted"]].values[:3] # (3, 2)
group_b = df[["actual", "predicted"]].values[2:] # (3, 2)
mmd(group_a, group_b)
sinkhorn_divergence(group_a, group_b, eps=1.0)
# ── Apply per-group with groupby ──
df["group"] = ["A", "A", "B", "B", "B"]
df.groupby("group").apply(
lambda g: mae(g["actual"].values, g["predicted"].values)
)
# ── Per-row: vectorized with axis=1 (fast, preferred) ──
# One call processes all rows at once via numpy — no Python loop.
features = pd.DataFrame({
"f1": [1.0, 2.0, 3.0],
"f2": [4.0, 5.0, 6.0],
"f3": [7.0, 8.0, 9.0],
})
df_result = features.copy()
df_result["l2"] = l2_norm(features.values, axis=1) # per-row L2 norm
df_result["mean"] = mean(features.values, axis=1) # per-row mean
df_result["std"] = std(features.values, axis=1) # per-row std
# Per-row between two column groups
df_emb = pd.DataFrame({
"a1": [1.0, 0.0], "a2": [0.0, 1.0],
"b1": [2.0, 0.0], "b2": [0.0, -1.0],
})
a_cols = df_emb[["a1", "a2"]].values # (2, 2)
b_cols = df_emb[["b1", "b2"]].values # (2, 2)
df_emb["cosine"] = cosine_similarity(a_cols, b_cols, axis=1) # [1.0, -1.0]
df_emb["mae"] = mae(a_cols, b_cols, axis=1) # per-row MAE
# ── Per-row: df.apply (slow fallback for irregular data) ──
# Use only when columns contain lists or variable-length data
# that can't form a regular 2-D array.
df_pairs = pd.DataFrame({
"actual": [[1.0, 2.0, 3.0], [4.0, 5.0]], # different lengths!
"predicted": [[1.1, 2.2, 2.9], [4.5, 5.1]],
})
df_pairs["mae"] = df_pairs.apply(
lambda row: mae(np.array(row["actual"]), np.array(row["predicted"])),
axis=1,
)

uv run pytest test_metrics.py -v
uv run mypy metrics.py --ignore-missing-imports