Skip to content

fxeqxmulfx/metrics

Repository files navigation

metrics

Evaluation metrics for NumPy and PyTorch with a unified API. Every function accepts both np.ndarray and torch.Tensor (with correct @overload type narrowing) and returns the matching type.

Installation

pip install git+https://github.com/fxeqxmulfx/metrics.git

Requires Python >= 3.10, NumPy >= 2.2 and PyTorch >= 2.11.

Norms

Function Formula Range Params
norm (sum(|x|^p))^(1/p) [0, +inf) p (default 2), axis
l1_norm sum(|x|) [0, +inf) axis
l2_norm sqrt(sum(x^2)) [0, +inf) axis
linf_norm max(|x|) [0, +inf) axis
import numpy as np
import torch
from metrics import norm, l1_norm, l2_norm, linf_norm

x = np.array([3.0, -4.0])

l1_norm(x)             # 7.0
l2_norm(x)             # 5.0
linf_norm(x)           # 4.0
norm(x, p=3)           # (27 + 64)^(1/3)
norm(x, p=0.5)         # fractional norm

# Per-row norms on 2-D
x_2d = np.array([[3.0, 0.0], [0.0, 4.0]])
l2_norm(x_2d, axis=1)  # [3.0, 4.0]

# PyTorch
xt = torch.tensor([3.0, -4.0])
l2_norm(xt)             # tensor(5.0)

Mean variants

| Function | Formula | Range | Params |
|---|---|---|---|
| mean | sum(x) / n | (-inf, +inf) | axis |
| geometric_mean | exp(mean(log(x))) | (0, +inf) | axis |
| harmonic_mean | 1 / mean(1/x) | (0, +inf) | axis |
| power_mean | mean(x^p)^(1/p) | depends on p | p, axis |
| trimmed_mean | mean after trimming tails | (-inf, +inf) | proportiontocut, axis |

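power_mean unifies the classical means: p=-inf gives the min, p=-1 the harmonic mean, p=0 the geometric mean (taken as the limit p -> 0, since mean(x^p)^(1/p) is undefined at p=0), p=1 the arithmetic mean, p=2 the quadratic mean (RMS), and p=+inf gives the max.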

For positive values: min <= harmonic <= geometric <= arithmetic <= RMS <= max.

import numpy as np
import torch
from metrics import (
    mean, geometric_mean, harmonic_mean, power_mean, trimmed_mean,
)

x = np.array([1.0, 2.0, 4.0, 8.0])

mean(x)                           # 3.75 (arithmetic)
geometric_mean(x)                 # 2.83 (exp(mean(log(x))))
harmonic_mean(x)                  # 2.13 (1 / mean(1/x))

# Power mean special cases
power_mean(x, p=-1)               # harmonic
power_mean(x, p=0)                # geometric
power_mean(x, p=1)                # arithmetic
power_mean(x, p=2)                # quadratic (RMS)
power_mean(x, p=float("inf"))     # max = 8.0
power_mean(x, p=float("-inf"))    # min = 1.0

# Trimmed mean: robust to outliers
y = np.array([1.0, 100.0, 3.0, 2.0, 200.0])
mean(y)                           # 61.2  (pulled by outliers)
trimmed_mean(y, 0.2)              # 35.0  (trims 20% from each tail)

# 2-D with axis
x_2d = np.array([[1.0, 4.0], [4.0, 16.0]])
geometric_mean(x_2d, axis=0)      # [2.0, 8.0]
harmonic_mean(x_2d, axis=0)       # [1.6, 6.4]
trimmed_mean(x_2d, 0.0, axis=1)   # per-row mean

# PyTorch
xt = torch.tensor([1.0, 2.0, 4.0, 8.0])
geometric_mean(xt)                 # tensor(2.83...)
power_mean(xt, p=2)                # tensor(...)

Summary statistics

| Function | Description | Range | Params |
|---|---|---|---|
| median | Median value | (-inf, +inf) | axis |
| std | Standard deviation (Bessel's correction by default) | [0, +inf) | axis, ddof (default 1) |
| perc | Percentile | (-inf, +inf) | q in [0, 100], axis |
| spearman_corr | Spearman rank correlation | [-1, 1] | axis |
| cosine_similarity | Cosine similarity | [-1, 1] | axis |

import numpy as np
import torch
from metrics import median, std, perc, spearman_corr, cosine_similarity

# --- NumPy ---
x = np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])

median(x)              # 4.5
std(x)                 # sample std (ddof=1)
std(x, ddof=0)         # population std
perc(x, 25)            # 25th percentile
perc(x, 75)            # 75th percentile

# 2-D with axis
x_2d = np.array([[1.0, 2.0], [3.0, 4.0]])
median(x_2d, axis=0)   # [2.0, 3.0]
std(x_2d, axis=0)      # per-column sample std
perc(x_2d, 50, axis=0) # per-column median

# Correlations
a = np.array([1.0, 2.0, 3.0, 4.0])
b = np.array([2.0, 4.0, 6.0, 8.0])

spearman_corr(a, b)    # 1.0 (perfectly monotone)

# Spearman captures monotone non-linear relationships
c = np.array([1.0, 4.0, 9.0, 16.0])
spearman_corr(a, c)    # 1.0 (perfectly monotone)

# Per-column correlation on 2-D
x_2d = np.array([[1.0, 4.0], [2.0, 3.0], [3.0, 2.0], [4.0, 1.0]])
y_2d = np.array([[10.0, 40.0], [20.0, 30.0], [30.0, 20.0], [40.0, 10.0]])
spearman_corr(x_2d, y_2d, axis=0)  # [1.0, 1.0]

# --- PyTorch ---
xt = torch.tensor([1.0, 2.0, 3.0, 4.0])
yt = torch.tensor([2.0, 4.0, 6.0, 8.0])
spearman_corr(xt, yt)  # tensor(1.0)

# Cosine similarity
a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 4.0, 6.0])
cosine_similarity(a, b)            # 1.0 (same direction)
cosine_similarity(a, -a)           # -1.0 (opposite)

# Scale-invariant
cosine_similarity(a, a * 100)      # 1.0

# Per-row similarity on 2-D (e.g. batch of embeddings)
x_2d = np.array([[1.0, 0.0], [0.0, 1.0]])
y_2d = np.array([[2.0, 0.0], [0.0, -1.0]])
cosine_similarity(x_2d, y_2d, axis=1)  # [1.0, -1.0]

Point-wise metrics

All point-wise metrics take (y_true, y_pred) and support an axis parameter.

Function Formula Range Params
mae mean(|y_true - y_pred|) [0, +inf) axis
mad median(|y_true - y_pred|) [0, +inf) axis
mse mean((y_true - y_pred)^2) [0, +inf) axis
rmse sqrt(mse) [0, +inf) axis
mape mean(|y_true - y_pred| / (|y_true| + eps)) * 100 [0, +inf) % eps (default 1e-8), axis
import numpy as np
import torch
from metrics import mae, mad, mse, rmse, mape

# --- NumPy ---
y_true = np.array([[1.0, 2.0], [3.0, 4.0]])
y_pred = np.array([[1.5, 2.5], [3.5, 4.5]])

mae(y_true, y_pred)              # 0.5  (scalar, reduces all axes)
mae(y_true, y_pred, axis=0)      # [0.5, 0.5]  (per-column)
mae(y_true, y_pred, axis=1)      # [0.5, 0.5]  (per-row)
mad(y_true, y_pred)              # 0.5  (robust to outliers)
mse(y_true, y_pred)              # 0.25
rmse(y_true, y_pred)             # 0.5
mape(y_true, y_pred)             # percentage error
mape(y_true, y_pred, eps=1e-8)   # eps guards against division by zero (1e-8 is the default)

# MAD is robust to outliers
y_true_out = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y_pred_out = np.array([1.1, 2.1, 3.1, 4.1, 105.0])
mae(y_true_out, y_pred_out)      # 20.08  (inflated by outlier)
mad(y_true_out, y_pred_out)      # 0.1   (ignores outlier)

# --- PyTorch (same API, gradients flow through) ---
yt = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
yp = torch.tensor([[1.5, 2.5], [3.5, 4.5]], requires_grad=True)

loss = mse(yt, yp)
loss.backward()    # yp.grad is populated

Distribution metrics

All distribution metrics take two point clouds of shape (N, D) and (M, D).

| Function | Description | Range | Params |
|---|---|---|---|
| mmd | Maximum Mean Discrepancy, RBF kernel (Gretton '06) | [0, +inf) | bandwidth (None = median heuristic) |
| sinkhorn_ot | Entropic regularized OT cost (Cuturi '13) | [0, +inf) | eps (default 1.0), n_iter (default 100) |
| sinkhorn_divergence | Debiased Sinkhorn divergence (Genevay, Peyre, Cuturi '18) | [0, +inf) | eps (default 1.0), n_iter (default 100) |

MMD vs Sinkhorn divergence

Both measure distance between distributions. Sinkhorn divergence interpolates between Wasserstein distance and MMD, getting the best of both:

| | MMD | Sinkhorn divergence |
|---|---|---|
| Speed | O(n^2), single pass | O(n^2) per iteration x n_iter |
| Sample efficiency | O(1/sqrt(n)), dimension-independent | O(1/sqrt(n)) for large eps |
| Geometry | Misses support away from dense areas | Recovers full support (small eps) |
| Bias | Unbiased | Debiased: SD(x, x) = 0 |
| Use case | Quick two-sample test | Training loss for generative models |

Asymptotic behaviour of the Sinkhorn divergence as eps varies:

  • eps -> 0: converges to Wasserstein distance W_c (geometric, but curse of dimension)
  • eps -> +inf: converges to 1/2 * MMD^2_{-c} (statistically efficient)

Rule of thumb: start with eps=1.0. Increase eps if gradients are noisy. Decrease eps (and increase n_iter) if the loss doesn't capture fine geometric structure.

import numpy as np
import torch
from metrics import mmd, sinkhorn_ot, sinkhorn_divergence

# --- MMD: quick comparison ---
x_np = np.random.randn(100, 2)
y_np = np.random.randn(100, 2) + 3.0

mmd(x_np, y_np)                    # auto bandwidth (median heuristic)
mmd(x_np, y_np, bandwidth=1.0)     # fixed bandwidth

# PyTorch
x = torch.randn(100, 2)
y = torch.randn(100, 2) + 3

mmd(x, y)                          # auto bandwidth (median heuristic)
mmd(x, y, bandwidth=0.5)           # fixed bandwidth

# --- Sinkhorn OT ---
sinkhorn_ot(x_np, y_np, eps=1.0)                # float
sinkhorn_ot(x, y, eps=1.0)                      # torch.Tensor (differentiable)
sinkhorn_ot(x, y, eps=0.1, n_iter=200)          # smaller eps needs more iterations

# --- Sinkhorn divergence: training loss for generative models ---
real = torch.randn(200, 64)
fake = torch.randn(200, 64, requires_grad=True)

loss = sinkhorn_divergence(real, fake, eps=1.0)  # balanced (recommended start)
loss.backward()                                   # gradients flow to fake

# eps controls the interpolation
sinkhorn_divergence(real, fake, eps=0.01, n_iter=500) # ≈ Wasserstein (geometric)
sinkhorn_divergence(real, fake, eps=1.0)               # balanced
sinkhorn_divergence(real, fake, eps=100.0)             # ≈ MMD (statistically efficient)

Sign correlation

Function Description Range Params
sign_corr Soft sign correlation of n-th finite differences [-1, 1] n (default 1), sharpness (default 10.0), axis (default 0, supports None)

+1 = all derivative signs match, -1 = all opposite, 0 = uncorrelated. Uses a differentiable sigmoid approximation of sign(x), namely 2 * sigmoid(sharpness * x) - 1, so gradients flow through for PyTorch tensors.

sharpness controls the trade-off between differentiability and hard sign:

| sharpness | Behaviour |
|---|---|
| ~1 | Very smooth, gradients are large but sign boundaries are blurry |
| ~10 (default) | Good balance: near-binary output, still useful gradients |
| ~100 | Almost identical to hard sign(), gradients vanish far from zero |
| -> inf | Exact hard sign, non-differentiable (zero gradient everywhere except at 0) |

In practice: use low sharpness (~1-5) when optimizing sign_corr as a loss, default (~10) for evaluation, and high sharpness (~100+) when you just want a hard match count.

import numpy as np
import torch
from metrics import sign_corr

# --- NumPy ---
old = np.array([0.0, 1.0, 4.0, 9.0, 16.0])
new = np.array([0.0, 1.1, 3.9, 9.2, 15.8])

sign_corr(old, new)                   # ~1.0, gradient signs match (n=1 default)
sign_corr(old, new, n=2)              # compare curvature signs instead
sign_corr(old, new, sharpness=100.0)  # sharper sigmoid, closer to hard sign

# Opposite shapes (convex vs. concave): every first-difference sign is flipped
convex  = np.array([4.0, 1.0, 0.0, 1.0, 4.0])   # x^2
concave = np.array([-4.0, -1.0, 0.0, -1.0, -4.0]) # -x^2
sign_corr(convex, concave)            # ~-1.0

# 2-D: per-column result
old_2d = np.array([[0.0, 0.0], [1.0, 1.0], [4.0, 0.0], [9.0, 1.0]])
new_2d = np.array([[0.0, 0.0], [1.0, 1.0], [4.0, 0.0], [9.0, 1.0]])
sign_corr(old_2d, new_2d)             # [1.0, 1.0]  (axis=0 default)
sign_corr(old_2d, new_2d, axis=None)  # scalar over all elements

# --- PyTorch (differentiable) ---
old_t = torch.tensor([0.0, 1.0, 4.0, 9.0, 16.0])
new_t = torch.tensor([0.0, 1.1, 3.9, 9.2, 15.8], requires_grad=True)

score = sign_corr(old_t, new_t)
score.backward()                       # new_t.grad is populated

Fast DTW

Function Description Range Params
fast_dtw Fast Dynamic Time Warping (Salvador & Chan '04) [0, +inf) radius (default 1)

Approximates DTW in O(N) time (for a fixed radius) instead of O(N^2) by recursively coarsening the time series and refining the warp path within a constraint radius. Inputs are 1-D (N,) or 2-D (N, D) time series that may have different lengths.

import numpy as np
import torch
from metrics import fast_dtw

# --- NumPy ---
x = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
y = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
fast_dtw(x, y)                        # 0.0 (identical)

# Different lengths
x = np.array([0.0, 1.0, 2.0])
y = np.array([0.0, 0.5, 1.0, 1.5, 2.0])
fast_dtw(x, y)                        # aligns despite length mismatch

# Multivariate time series (N, D)
x_2d = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]])
y_2d = np.array([[0.5, 0.5], [1.5, 1.5], [2.5, 2.5]])
fast_dtw(x_2d, y_2d)

# Larger radius = better approximation, slower
fast_dtw(x, y, radius=1)             # default, fastest
fast_dtw(x, y, radius=5)             # closer to exact DTW

# --- PyTorch ---
xt = torch.tensor([0.0, 1.0, 2.0, 3.0])
yt = torch.tensor([0.5, 1.5, 2.5, 3.5])
fast_dtw(xt, yt)                      # tensor(...)

Usage with pandas

All functions accept np.ndarray, so convert DataFrame columns (or entire DataFrames) with .values or .to_numpy() before passing them in.

import numpy as np
import pandas as pd
from metrics import (
    mae, mse, rmse, mape, mad,
    mean, geometric_mean, harmonic_mean, trimmed_mean, median, std, perc,
    l1_norm, l2_norm, linf_norm,
    spearman_corr, cosine_similarity, sign_corr,
    mmd, sinkhorn_divergence,
)

df = pd.DataFrame({
    "actual":    [100.0, 200.0, 300.0, 400.0, 500.0],
    "predicted": [110.0, 190.0, 310.0, 420.0, 480.0],
})

# ── Point-wise metrics on columns ──
mae(df["actual"].values, df["predicted"].values)    # scalar
mse(df["actual"].values, df["predicted"].values)
rmse(df["actual"].values, df["predicted"].values)
mape(df["actual"].values, df["predicted"].values)   # in percent
mad(df["actual"].values, df["predicted"].values)     # robust to outliers

# ── Summary stats on a single column ──
mean(df["actual"].values)
geometric_mean(df["actual"].values)
harmonic_mean(df["actual"].values)
trimmed_mean(df["actual"].values, 0.1)
median(df["actual"].values)
std(df["actual"].values)              # ddof=1 by default
perc(df["actual"].values, 90)         # 90th percentile

# ── Norms ──
l2_norm(df["actual"].values)

# ── Column-wise metrics on entire DataFrame ──
vals = df[["actual", "predicted"]].values  # shape (5, 2)
mean(vals, axis=0)                         # per-column mean
std(vals, axis=0)                          # per-column std

# ── Correlation between columns ──
spearman_corr(df["actual"].values, df["predicted"].values)

# ── Sign correlation (trend-direction agreement of time series; n=1 default) ──
sign_corr(df["actual"].values, df["predicted"].values)

# ── Cosine similarity between two feature vectors ──
cosine_similarity(df["actual"].values, df["predicted"].values)

# ── Distribution metrics (need 2-D point clouds) ──
# e.g. comparing two groups of multi-dimensional samples
group_a = df[["actual", "predicted"]].values[:3]   # (3, 2)
group_b = df[["actual", "predicted"]].values[2:]   # (3, 2)
mmd(group_a, group_b)
sinkhorn_divergence(group_a, group_b, eps=1.0)

# ── Apply per-group with groupby ──
df["group"] = ["A", "A", "B", "B", "B"]
df.groupby("group").apply(
    lambda g: mae(g["actual"].values, g["predicted"].values)
)

# ── Per-row: vectorized with axis=1 (fast, preferred) ──
# One call processes all rows at once via numpy — no Python loop.
features = pd.DataFrame({
    "f1": [1.0, 2.0, 3.0],
    "f2": [4.0, 5.0, 6.0],
    "f3": [7.0, 8.0, 9.0],
})
df_result = features.copy()
df_result["l2"] = l2_norm(features.values, axis=1)         # per-row L2 norm
df_result["mean"] = mean(features.values, axis=1)           # per-row mean
df_result["std"] = std(features.values, axis=1)             # per-row std

# Per-row between two column groups
df_emb = pd.DataFrame({
    "a1": [1.0, 0.0], "a2": [0.0, 1.0],
    "b1": [2.0, 0.0], "b2": [0.0, -1.0],
})
a_cols = df_emb[["a1", "a2"]].values  # (2, 2)
b_cols = df_emb[["b1", "b2"]].values  # (2, 2)
df_emb["cosine"] = cosine_similarity(a_cols, b_cols, axis=1)  # [1.0, -1.0]
df_emb["mae"] = mae(a_cols, b_cols, axis=1)                   # per-row MAE

# ── Per-row: df.apply (slow fallback for irregular data) ──
# Use only when columns contain lists or variable-length data
# that can't form a regular 2-D array.
df_pairs = pd.DataFrame({
    "actual":    [[1.0, 2.0, 3.0], [4.0, 5.0]],       # different lengths!
    "predicted": [[1.1, 2.2, 2.9], [4.5, 5.1]],
})
df_pairs["mae"] = df_pairs.apply(
    lambda row: mae(np.array(row["actual"]), np.array(row["predicted"])),
    axis=1,
)

Tests

uv run pytest test_metrics.py -v

Type checking

uv run mypy metrics.py --ignore-missing-imports

About

No description, website, or topics provided.

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

 
 
 

Contributors

Languages