metrics

Evaluation metrics for NumPy and PyTorch with a unified API. Every function accepts both np.ndarray and torch.Tensor (with correct @overload type narrowing) and returns the matching type.

Installation

pip install git+https://github.com/fxeqxmulfx/metrics.git

Requires Python >= 3.10, NumPy >= 2.2 and PyTorch >= 2.11.

Norms

Function	Formula	Range	Params
`norm`	`(sum(\|x\|^p))^(1/p)`	`[0, +inf)`	`p` (default 2), `axis`
`l1_norm`	`sum(\|x\|)`	`[0, +inf)`	`axis`
`l2_norm`	`sqrt(sum(x^2))`	`[0, +inf)`	`axis`
`linf_norm`	`max(\|x\|)`	`[0, +inf)`	`axis`

import numpy as np
import torch
from metrics import norm, l1_norm, l2_norm, linf_norm

x = np.array([3.0, -4.0])

l1_norm(x)             # 7.0
l2_norm(x)             # 5.0
linf_norm(x)           # 4.0
norm(x, p=3)           # (27 + 64)^(1/3)
norm(x, p=0.5)         # fractional norm

# Per-row norms on 2-D
x_2d = np.array([[3.0, 0.0], [0.0, 4.0]])
l2_norm(x_2d, axis=1)  # [3.0, 4.0]

# PyTorch
xt = torch.tensor([3.0, -4.0])
l2_norm(xt)             # tensor(5.0)

Mean variants

Function	Formula	Range	Params
`mean`	`sum(x) / n`	`(-inf, +inf)`	`axis`
`geometric_mean`	`exp(mean(log(x)))`	`(0, +inf)`	`axis`
`harmonic_mean`	`1 / mean(1/x)`	`(0, +inf)`	`axis`
`power_mean`	`mean(x^p)^(1/p)`	depends on p	`p`, `axis`
`trimmed_mean`	mean after trimming tails	`(-inf, +inf)`	`proportiontocut`, `axis`

power_mean generalizes the classical means: p=-inf gives min, p=-1 harmonic, p=0 geometric (the limit as p -> 0), p=1 arithmetic, p=2 quadratic (RMS), and p=+inf gives max.

For positive values: min <= harmonic <= geometric <= arithmetic <= RMS <= max.

import numpy as np
import torch
from metrics import (
    mean, geometric_mean, harmonic_mean, power_mean, trimmed_mean,
)

x = np.array([1.0, 2.0, 4.0, 8.0])

mean(x)                           # 3.75 (arithmetic)
geometric_mean(x)                 # 2.83 (exp(mean(log(x))))
harmonic_mean(x)                  # 2.13 (1 / mean(1/x))

# Power mean special cases
power_mean(x, p=-1)               # harmonic
power_mean(x, p=0)                # geometric
power_mean(x, p=1)                # arithmetic
power_mean(x, p=2)                # quadratic (RMS)
power_mean(x, p=float("inf"))     # max = 8.0
power_mean(x, p=float("-inf"))    # min = 1.0

# Trimmed mean: robust to outliers
y = np.array([1.0, 100.0, 3.0, 2.0, 200.0])
mean(y)                           # 61.2  (pulled by outliers)
trimmed_mean(y, 0.2)              # 35.0  (trims 20% from each tail)

# 2-D with axis
x_2d = np.array([[1.0, 4.0], [4.0, 16.0]])
geometric_mean(x_2d, axis=0)      # [2.0, 8.0]
harmonic_mean(x_2d, axis=0)       # [1.6, 6.4]
trimmed_mean(x_2d, 0.0, axis=1)   # per-row mean

# PyTorch
xt = torch.tensor([1.0, 2.0, 4.0, 8.0])
geometric_mean(xt)                 # tensor(2.83...)
power_mean(xt, p=2)                # tensor(...)

Summary statistics

Function	Description	Range	Params
`median`	Median value	`(-inf, +inf)`	`axis`
`std`	Standard deviation (Bessel's correction by default)	`[0, +inf)`	`axis`, `ddof` (default 1)
`perc`	Percentile	`(-inf, +inf)`	`q` in `[0, 100]`, `axis`
`spearman_corr`	Spearman rank correlation	`[-1, 1]`	`axis`
`cosine_similarity`	Cosine similarity	`[-1, 1]`	`axis`

import numpy as np
import torch
from metrics import median, std, perc, spearman_corr, cosine_similarity

# --- NumPy ---
x = np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])

median(x)              # 4.5
std(x)                 # sample std (ddof=1)
std(x, ddof=0)         # population std
perc(x, 25)            # 25th percentile
perc(x, 75)            # 75th percentile

# 2-D with axis
x_2d = np.array([[1.0, 2.0], [3.0, 4.0]])
median(x_2d, axis=0)   # [2.0, 3.0]
std(x_2d, axis=0)      # per-column sample std
perc(x_2d, 50, axis=0) # per-column median

# Correlations
a = np.array([1.0, 2.0, 3.0, 4.0])
b = np.array([2.0, 4.0, 6.0, 8.0])

spearman_corr(a, b)    # 1.0 (perfectly monotone)

# Spearman captures monotone non-linear relationships
c = np.array([1.0, 4.0, 9.0, 16.0])
spearman_corr(a, c)    # 1.0 (perfectly monotone)

# Per-column correlation on 2-D
x_2d = np.array([[1.0, 4.0], [2.0, 3.0], [3.0, 2.0], [4.0, 1.0]])
y_2d = np.array([[10.0, 40.0], [20.0, 30.0], [30.0, 20.0], [40.0, 10.0]])
spearman_corr(x_2d, y_2d, axis=0)  # [1.0, 1.0]

# --- PyTorch ---
xt = torch.tensor([1.0, 2.0, 3.0, 4.0])
yt = torch.tensor([2.0, 4.0, 6.0, 8.0])
spearman_corr(xt, yt)  # tensor(1.0)

# Cosine similarity
a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 4.0, 6.0])
cosine_similarity(a, b)            # 1.0 (same direction)
cosine_similarity(a, -a)           # -1.0 (opposite)

# Scale-invariant
cosine_similarity(a, a * 100)      # 1.0

# Per-row similarity on 2-D (e.g. batch of embeddings)
x_2d = np.array([[1.0, 0.0], [0.0, 1.0]])
y_2d = np.array([[2.0, 0.0], [0.0, -1.0]])
cosine_similarity(x_2d, y_2d, axis=1)  # [1.0, -1.0]

Point-wise metrics

All point-wise metrics take (y_true, y_pred) and support an axis parameter.

Function	Formula	Range	Params
`mae`	`mean(\|y_true - y_pred\|)`	`[0, +inf)`	`axis`
`mad`	`median(\|y_true - y_pred\|)`	`[0, +inf)`	`axis`
`mse`	`mean((y_true - y_pred)^2)`	`[0, +inf)`	`axis`
`rmse`	`sqrt(mse)`	`[0, +inf)`	`axis`
`mape`	`mean(\|y_true - y_pred\| / (\|y_true\| + eps)) * 100`	`[0, +inf)` %	`eps` (default 1e-8), `axis`

import numpy as np
import torch
from metrics import mae, mad, mse, rmse, mape

# --- NumPy ---
y_true = np.array([[1.0, 2.0], [3.0, 4.0]])
y_pred = np.array([[1.5, 2.5], [3.5, 4.5]])

mae(y_true, y_pred)              # 0.5  (scalar, reduces all axes)
mae(y_true, y_pred, axis=0)      # [0.5, 0.5]  (per-column)
mae(y_true, y_pred, axis=1)      # [0.5, 0.5]  (per-row)
mad(y_true, y_pred)              # 0.5  (robust to outliers)
mse(y_true, y_pred)              # 0.25
rmse(y_true, y_pred)             # 0.5
mape(y_true, y_pred)             # percentage error
mape(y_true, y_pred, eps=1e-8)   # eps (default 1e-8) guards against division by zero

# MAD is robust to outliers
y_true_out = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y_pred_out = np.array([1.1, 2.1, 3.1, 4.1, 105.0])
mae(y_true_out, y_pred_out)      # 20.08  (inflated by outlier)
mad(y_true_out, y_pred_out)      # 0.1   (ignores outlier)

# --- PyTorch (same API, gradients flow through) ---
yt = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
yp = torch.tensor([[1.5, 2.5], [3.5, 4.5]], requires_grad=True)

loss = mse(yt, yp)
loss.backward()    # yp.grad is populated

Distribution metrics

All distribution metrics take two point clouds of shape (N, D) and (M, D).

Function	Description	Range	Params
`mmd`	Maximum Mean Discrepancy, RBF kernel (Gretton '06)	`[0, +inf)`	`bandwidth` (`None` = median heuristic)
`sinkhorn_ot`	Entropic regularized OT cost (Cuturi '13)	`[0, +inf)`	`eps` (default 1.0), `n_iter` (default 100)
`sinkhorn_divergence`	Debiased Sinkhorn divergence (Genevay, Peyre, Cuturi '18)	`[0, +inf)`	`eps` (default 1.0), `n_iter` (default 100)

MMD vs Sinkhorn divergence

Both measure distance between distributions. Sinkhorn divergence interpolates between Wasserstein distance and MMD, getting the best of both:

	MMD	Sinkhorn divergence
Speed	O(n^2), single pass	O(n^2) per iteration x `n_iter`
Sample efficiency	O(1/sqrt(n)), dimension-independent	O(1/sqrt(n)) for large eps
Geometry	Misses support away from dense areas	Recovers full support (small eps)
Bias	Unbiased	Debiased: `SD(x, x) = 0`
Use case	Quick two-sample test	Training loss for generative models

Asymptotic behaviour of eps:

eps -> 0: converges to Wasserstein distance W_c (geometric, but suffers from the curse of dimensionality)
eps -> +inf: converges to 1/2 * MMD^2_{-c} (statistically efficient)

Rule of thumb: start with eps=1.0. Increase eps if gradients are noisy. Decrease eps (and increase n_iter) if the loss doesn't capture fine geometric structure.

import numpy as np
import torch
from metrics import mmd, sinkhorn_ot, sinkhorn_divergence

# --- MMD: quick comparison ---
x_np = np.random.randn(100, 2)
y_np = np.random.randn(100, 2) + 3.0

mmd(x_np, y_np)                    # auto bandwidth (median heuristic)
mmd(x_np, y_np, bandwidth=1.0)     # fixed bandwidth

# PyTorch
x = torch.randn(100, 2)
y = torch.randn(100, 2) + 3

mmd(x, y)                          # auto bandwidth (median heuristic)
mmd(x, y, bandwidth=0.5)           # fixed bandwidth

# --- Sinkhorn OT ---
sinkhorn_ot(x_np, y_np, eps=1.0)                # float
sinkhorn_ot(x, y, eps=1.0)                      # torch.Tensor (differentiable)
sinkhorn_ot(x, y, eps=0.1, n_iter=200)          # smaller eps needs more iterations

# --- Sinkhorn divergence: training loss for generative models ---
real = torch.randn(200, 64)
fake = torch.randn(200, 64, requires_grad=True)

loss = sinkhorn_divergence(real, fake, eps=1.0)  # balanced (recommended start)
loss.backward()                                   # gradients flow to fake

# eps controls the interpolation
sinkhorn_divergence(real, fake, eps=0.01, n_iter=500) # ≈ Wasserstein (geometric)
sinkhorn_divergence(real, fake, eps=1.0)               # balanced
sinkhorn_divergence(real, fake, eps=100.0)             # ≈ MMD (statistically efficient)

Sign correlation

Function	Description	Range	Params
`sign_corr`	Soft sign correlation of n-th finite differences	`[-1, 1]`	`n` (default 1), `sharpness` (default 10.0), `axis` (default 0, supports `None`)

+1 = all derivative signs match, -1 = all opposite, 0 = uncorrelated. Uses a differentiable sigmoid approximation (2 * sigmoid(sharpness * x) - 1) so gradients flow through for PyTorch.

sharpness controls the trade-off between differentiability and hard sign:

sharpness	Behaviour
~1	Very smooth, gradients are large but sign boundaries are blurry
~10 (default)	Good balance: near-binary output, still useful gradients
~100	Almost identical to hard `sign()`, gradients vanish far from zero
-> inf	Exact hard sign, non-differentiable (zero gradient everywhere except at 0)

In practice: use low sharpness (~1-5) when optimizing sign_corr as a loss, default (~10) for evaluation, and high sharpness (~100+) when you just want a hard match count.

import numpy as np
import torch
from metrics import sign_corr

# --- NumPy ---
old = np.array([0.0, 1.0, 4.0, 9.0, 16.0])
new = np.array([0.0, 1.1, 3.9, 9.2, 15.8])

sign_corr(old, new)                   # ~1.0, gradient signs match (n=1 default)
sign_corr(old, new, n=2)              # compare curvature signs instead
sign_corr(old, new, sharpness=100.0)  # sharper sigmoid, closer to hard sign

# Mirrored shapes: every slope sign is opposite (default n=1 compares first differences)
convex  = np.array([4.0, 1.0, 0.0, 1.0, 4.0])   # x^2
concave = np.array([-4.0, -1.0, 0.0, -1.0, -4.0]) # -x^2
sign_corr(convex, concave)            # ~-1.0

# 2-D: per-column result
old_2d = np.array([[0.0, 0.0], [1.0, 1.0], [4.0, 0.0], [9.0, 1.0]])
new_2d = np.array([[0.0, 0.0], [1.0, 1.0], [4.0, 0.0], [9.0, 1.0]])
sign_corr(old_2d, new_2d)             # [1.0, 1.0]  (axis=0 default)
sign_corr(old_2d, new_2d, axis=None)  # scalar over all elements

# --- PyTorch (differentiable) ---
old_t = torch.tensor([0.0, 1.0, 4.0, 9.0, 16.0])
new_t = torch.tensor([0.0, 1.1, 3.9, 9.2, 15.8], requires_grad=True)

score = sign_corr(old_t, new_t)
score.backward()                       # new_t.grad is populated

Fast DTW

Function	Description	Range	Params
`fast_dtw`	Fast Dynamic Time Warping (Salvador & Chan '04)	`[0, +inf)`	`radius` (default 1)

Approximates DTW in O(N) instead of O(N^2) by recursively coarsening the time series and refining the warp path within a constraint radius. Inputs are 1-D (N,) or 2-D (N, D) time series that may have different lengths.

import numpy as np
import torch
from metrics import fast_dtw

# --- NumPy ---
x = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
y = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
fast_dtw(x, y)                        # 0.0 (identical)

# Different lengths
x = np.array([0.0, 1.0, 2.0])
y = np.array([0.0, 0.5, 1.0, 1.5, 2.0])
fast_dtw(x, y)                        # aligns despite length mismatch

# Multivariate time series (N, D)
x_2d = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]])
y_2d = np.array([[0.5, 0.5], [1.5, 1.5], [2.5, 2.5]])
fast_dtw(x_2d, y_2d)

# Larger radius = better approximation, slower
fast_dtw(x, y, radius=1)             # default, fastest
fast_dtw(x, y, radius=5)             # closer to exact DTW

# --- PyTorch ---
xt = torch.tensor([0.0, 1.0, 2.0, 3.0])
yt = torch.tensor([0.5, 1.5, 2.5, 3.5])
fast_dtw(xt, yt)                      # tensor(...)

Usage with pandas

All functions accept np.ndarray, so use .values (or .to_numpy()) to pass DataFrame columns or entire DataFrames directly.

import numpy as np
import pandas as pd
from metrics import (
    mae, mse, rmse, mape, mad,
    mean, geometric_mean, harmonic_mean, trimmed_mean, median, std, perc,
    l1_norm, l2_norm, linf_norm,
    spearman_corr, cosine_similarity, sign_corr,
    mmd, sinkhorn_divergence,
)

df = pd.DataFrame({
    "actual":    [100.0, 200.0, 300.0, 400.0, 500.0],
    "predicted": [110.0, 190.0, 310.0, 420.0, 480.0],
})

# ── Point-wise metrics on columns ──
mae(df["actual"].values, df["predicted"].values)    # scalar
mse(df["actual"].values, df["predicted"].values)
rmse(df["actual"].values, df["predicted"].values)
mape(df["actual"].values, df["predicted"].values)   # in percent
mad(df["actual"].values, df["predicted"].values)     # robust to outliers

# ── Summary stats on a single column ──
mean(df["actual"].values)
geometric_mean(df["actual"].values)
harmonic_mean(df["actual"].values)
trimmed_mean(df["actual"].values, 0.1)
median(df["actual"].values)
std(df["actual"].values)              # ddof=1 by default
perc(df["actual"].values, 90)         # 90th percentile

# ── Norms ──
l2_norm(df["actual"].values)

# ── Column-wise metrics on entire DataFrame ──
vals = df[["actual", "predicted"]].values  # shape (5, 2)
mean(vals, axis=0)                         # per-column mean
std(vals, axis=0)                          # per-column std

# ── Correlation between columns ──
spearman_corr(df["actual"].values, df["predicted"].values)

# ── Sign correlation (slope-sign agreement of time series; pass n=2 for curvature) ──
sign_corr(df["actual"].values, df["predicted"].values)

# ── Cosine similarity between two feature vectors ──
cosine_similarity(df["actual"].values, df["predicted"].values)

# ── Distribution metrics (need 2-D point clouds) ──
# e.g. comparing two groups of multi-dimensional samples
group_a = df[["actual", "predicted"]].values[:3]   # (3, 2)
group_b = df[["actual", "predicted"]].values[2:]   # (3, 2)
mmd(group_a, group_b)
sinkhorn_divergence(group_a, group_b, eps=1.0)

# ── Apply per-group with groupby ──
df["group"] = ["A", "A", "B", "B", "B"]
df.groupby("group").apply(
    lambda g: mae(g["actual"].values, g["predicted"].values)
)

# ── Per-row: vectorized with axis=1 (fast, preferred) ──
# One call processes all rows at once via numpy — no Python loop.
features = pd.DataFrame({
    "f1": [1.0, 2.0, 3.0],
    "f2": [4.0, 5.0, 6.0],
    "f3": [7.0, 8.0, 9.0],
})
df_result = features.copy()
df_result["l2"] = l2_norm(features.values, axis=1)         # per-row L2 norm
df_result["mean"] = mean(features.values, axis=1)           # per-row mean
df_result["std"] = std(features.values, axis=1)             # per-row std

# Per-row between two column groups
df_emb = pd.DataFrame({
    "a1": [1.0, 0.0], "a2": [0.0, 1.0],
    "b1": [2.0, 0.0], "b2": [0.0, -1.0],
})
a_cols = df_emb[["a1", "a2"]].values  # (2, 2)
b_cols = df_emb[["b1", "b2"]].values  # (2, 2)
df_emb["cosine"] = cosine_similarity(a_cols, b_cols, axis=1)  # [1.0, -1.0]
df_emb["mae"] = mae(a_cols, b_cols, axis=1)                   # per-row MAE

# ── Per-row: df.apply (slow fallback for irregular data) ──
# Use only when columns contain lists or variable-length data
# that can't form a regular 2-D array.
df_pairs = pd.DataFrame({
    "actual":    [[1.0, 2.0, 3.0], [4.0, 5.0]],       # different lengths!
    "predicted": [[1.1, 2.2, 2.9], [4.5, 5.1]],
})
df_pairs["mae"] = df_pairs.apply(
    lambda row: mae(np.array(row["actual"]), np.array(row["predicted"])),
    axis=1,
)

Tests

uv run pytest test_metrics.py -v

Type checking

uv run mypy metrics.py --ignore-missing-imports

Name		Name	Last commit message	Last commit date
Latest commit History 6 Commits
.gitignore		.gitignore
.python-version		.python-version
LICENSE.txt		LICENSE.txt
README.md		README.md
metrics.py		metrics.py
pyproject.toml		pyproject.toml
test_metrics.py		test_metrics.py
test_types.py		test_types.py
uv.lock		uv.lock

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

metrics

Installation

Norms

Mean variants

Summary statistics

Point-wise metrics

Distribution metrics

MMD vs Sinkhorn divergence

Sign correlation

Fast DTW

Usage with pandas

Tests

Type checking

About

Uh oh!

Releases

Packages

Uh oh!

Contributors

Uh oh!

Languages

Folders and files

Latest commit

History

Repository files navigation

metrics

Installation

Norms

Mean variants

Summary statistics

Point-wise metrics

Distribution metrics

MMD vs Sinkhorn divergence

Sign correlation

Fast DTW

Usage with pandas

Tests

Type checking

About

Resources

License

Uh oh!

Stars

Watchers

Forks

Releases

Packages 0

Uh oh!

Contributors

Uh oh!

Languages

Packages