From 221dbfbf6d47724ce7cf48b56d732c432e4d7faa Mon Sep 17 00:00:00 2001 From: Connor Stone Date: Fri, 6 Jun 2025 18:14:04 -0400 Subject: [PATCH 1/5] change coverage test to two tail p-value --- README.md | 24 ++++++++++++++++++++++-- src/pted/pted.py | 8 +++++--- src/pted/tests.py | 2 +- src/pted/utils.py | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1c26424..253b47d 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,8 @@ print(f"p-value: {p_value:.3f}") # expect uniform random from 0-1 ## How it works +### Two sample test + PTED uses the energy distance of the two samples `x` and `y`, this is computed as: $$d = \frac{2}{n_xn_y}\sum_{i,j}||x_i - y_j|| - \frac{1}{n_x^2}\sum_{i,j}||x_i - x_j|| - \frac{1}{n_y^2}\sum_{i,j}||y_i - y_j||$$ @@ -105,6 +107,21 @@ grey distribution is just a histogram of many re-runs of this procedure. We compute a p-value by taking the fraction of the energy distances that are greater than the current one. +### Coverage test + +In the coverage test we have some number of simulations `nsim` where there is a +true value `g` and some posterior samples `s`. For each simulation separately we +use PTED to compute a p-value, essentially asking the question "was `g` drawn +from the distribution that generated `s`?". Individually, these tests are not +especially informative, however their p-values must have been drawn from +`U(0,1)` under the null-hypothesis. Thus we just need a way to combine their +statistical power. It turns out that for some `p ~ U(0,1)` value, we have that +`- 2 ln(p)` is chi2 distributed with `dof = 2`. This means that we can sum the +chi2 values for the PTED test on each simulation and compare with a chi2 +distribution with `dof = 2 * nsim`. We use a density based two tailed p-value +test on this chi2 distribution meaning that if your posterior is underconfident +or overconfident, you will get a small p-value that can be used to reject the +null. ## GPU Compatibility @@ -113,7 +130,9 @@ PyTorch Tensors on the appropriate device. ## Citation -If you use PTED in your work, please include a citation to the [zenodo record](https://doi.org/10.5281/zenodo.15353928) and also see below for references of the underlying method. +If you use PTED in your work, please include a citation to the [zenodo +record](https://doi.org/10.5281/zenodo.15353928) and also see below for +references of the underlying method. ## Reference @@ -132,7 +151,8 @@ I didn't invent this test, I just think its neat. Here is a paper on the subject } ``` -Permutation tests are a whole class of tests, with much literature. Here are some starting points: +Permutation tests are a whole class of tests, with much literature. Here are +some starting points: ``` @book{good2013permutation, diff --git a/src/pted/pted.py b/src/pted/pted.py index 16d05eb..c8c26bf 100644 --- a/src/pted/pted.py +++ b/src/pted/pted.py @@ -3,7 +3,7 @@ from scipy.stats import chi2 as chi2_dist from torch import Tensor -from .utils import _pted_torch, _pted_numpy, _pted_chunk_torch, _pted_chunk_numpy +from .utils import _pted_torch, _pted_numpy, _pted_chunk_torch, _pted_chunk_numpy, two_tailed_p __all__ = ["pted", "pted_coverage_test"] @@ -262,7 +262,9 @@ def pted_coverage_test( return test_stats, permute_stats # Compute p-values + if nsim == 1: + return np.mean(permute_stats > test_stats[0]) pvals = np.mean(permute_stats > test_stats[:, None], axis=1) pvals[pvals == 0] = 1.0 / permutations # handle pvals == 0 - chi2 = -2 * np.log(pvals) - return 1 - chi2_dist.cdf(np.sum(chi2), 2 * nsim) + chi2 = np.sum(-2 * np.log(pvals)) + return two_tailed_p(chi2, 2 * nsim) diff --git a/src/pted/tests.py b/src/pted/tests.py index 563c24e..5adeeef 100644 --- a/src/pted/tests.py +++ b/src/pted/tests.py @@ -49,6 +49,6 @@ def test(): assert p < 1e-4, f"p-value {p} is not in the expected range (~0)" # underconfident p = pted_coverage_test(g, s_under, permutations=200) - assert p > 0.9999, f"p-value {p} is not in the expected range (~1)" + assert p < 1e-4, f"p-value {p} is not in the expected range (~0)" print("Tests passed!") diff --git a/src/pted/utils.py b/src/pted/utils.py index 7a31f91..65c855a 100644 --- a/src/pted/utils.py +++ b/src/pted/utils.py @@ -3,6 +3,9 @@ import numpy as np from scipy.spatial.distance import cdist import torch +from scipy.stats import chi2 as chi2_dist +from scipy.optimize import root_scalar + __all__ = ["_pted_numpy", "_pted_chunk_numpy", "_pted_torch", "_pted_chunk_torch"] @@ -172,3 +175,37 @@ def _pted_torch( dmatrix = dmatrix[I][:, I] permute_stats.append(_energy_distance_precompute(dmatrix, nx, ny).item()) return test_stat, permute_stats + + +def two_tailed_p(chi2, df): + assert df > 2, "Degrees of freedom must be greater than 2 for two-tailed p-value calculation." + alpha = chi2_dist.pdf(chi2, df) + mode = df - 2 + + if chi2 == mode: + return 1.0 + + def root_eq(x): + return chi2_dist.pdf(x, df) - alpha + + # Find left root + if chi2 < mode: + left = chi2_dist.cdf(chi2, df) + else: + try: + res_left = root_scalar(root_eq, bracket=[0, mode], method="brentq") + left = chi2_dist.cdf(res_left.root, df) + except ValueError: + left = 0.0 # Assume negligible + + # Find right root + if chi2 > mode: + right = chi2_dist.sf(chi2, df) + else: + try: + res_right = root_scalar(root_eq, bracket=[mode, 100 * df], method="brentq") + right = chi2_dist.sf(res_right.root, df) + except ValueError: + right = 0.0 # Assume negligible + + return left + right From ec32caa97598a56aa7faa1d3d291369f91f8818a Mon Sep 17 00:00:00 2001 From: Connor Stone Date: Fri, 6 Jun 2025 19:37:54 -0400 Subject: [PATCH 2/5] better coverage and tests --- src/pted/pted.py | 1 + src/pted/utils.py | 8 ++++++-- tests/test_pted.py | 8 ++++++++ tests/test_utils.py | 20 ++++++++++++++++++++ 4 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 tests/test_utils.py diff --git a/src/pted/pted.py b/src/pted/pted.py index c8c26bf..8a3cd89 100644 --- a/src/pted/pted.py +++ b/src/pted/pted.py @@ -234,6 +234,7 @@ def pted_coverage_test( of permutations. For chunking to be worth it you should have c^2 * I << n^2. """ nsamp, nsim, *_ = s.shape + assert nsim > 0, "need some simulations to run test, got 0 simulations" assert ( g.shape == s.shape[1:] ), f"g and s must have the same shape (past first dim of s), not {g.shape} and {s.shape}" diff --git a/src/pted/utils.py b/src/pted/utils.py index 65c855a..0513097 100644 --- a/src/pted/utils.py +++ b/src/pted/utils.py @@ -182,7 +182,7 @@ def two_tailed_p(chi2, df): alpha = chi2_dist.pdf(chi2, df) mode = df - 2 - if chi2 == mode: + if np.isclose(chi2, mode): return 1.0 def root_eq(x): @@ -203,7 +203,11 @@ def root_eq(x): right = chi2_dist.sf(chi2, df) else: try: - res_right = root_scalar(root_eq, bracket=[mode, 100 * df], method="brentq") + res_right = root_scalar( + lambda x: root_eq(x), bracket=[mode, 1000 * df], method="brentq" + ) + if not res_right.converged: + raise ValueError right = chi2_dist.sf(res_right.root, df) except ValueError: right = 0.0 # Assume negligible diff --git a/tests/test_pted.py b/tests/test_pted.py index 01ac54d..8ac8bf8 100644 --- a/tests/test_pted.py +++ b/tests/test_pted.py @@ -87,3 +87,11 @@ def test_pted_chunk_numpy(): y = np.random.uniform(size=(1000, D)) p = pted.pted(x, y, chunk_size=100, chunk_iter=10) assert p < 1e-4, f"p-value {p} is not in the expected range (~0)" + + +def test_pted_coverage_edgecase(): + # Test with single simulation + g = np.random.normal(size=(1, 10)) + s = np.random.normal(size=(100, 1, 10)) + p = pted.pted_coverage_test(g, s) + assert p > 1e-4 and p < 0.9999, f"p-value {p} is not in the expected range (~1)" diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..69be628 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,20 @@ +import numpy as np + +from pted.utils import two_tailed_p + +import pytest + + +def test_two_tailed_p(): + + assert np.isclose(two_tailed_p(4, 6), 1.0), "p-value at mode should be 1.0" + + assert two_tailed_p(0.01, 10) < 0.01, "p-value should be less than 0.01 for small chi2" + assert two_tailed_p(100, 10) < 0.01, "p-value should be less than 0.01 for large chi2" + assert two_tailed_p(10, 10) > 0.01, "p-value should be close to 0.5 for chi2 near mode" + + assert two_tailed_p(0, 10) < 0.01 + assert two_tailed_p(1e10, 5) < 0.01 + + with pytest.raises(AssertionError): + two_tailed_p(4, 2) From b13d4552deca4a66377d9aab131a71e6478802d1 Mon Sep 17 00:00:00 2001 From: Connor Stone Date: Fri, 6 Jun 2025 19:43:31 -0400 Subject: [PATCH 3/5] attempt better coverage --- src/pted/utils.py | 7 ++----- tests/test_utils.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/pted/utils.py b/src/pted/utils.py index 0513097..b8b60e0 100644 --- a/src/pted/utils.py +++ b/src/pted/utils.py @@ -192,11 +192,8 @@ def root_eq(x): if chi2 < mode: left = chi2_dist.cdf(chi2, df) else: - try: - res_left = root_scalar(root_eq, bracket=[0, mode], method="brentq") - left = chi2_dist.cdf(res_left.root, df) - except ValueError: - left = 0.0 # Assume negligible + res_left = root_scalar(root_eq, bracket=[0, mode], method="brentq") + left = chi2_dist.cdf(res_left.root, df) # Find right root if chi2 > mode: diff --git a/tests/test_utils.py b/tests/test_utils.py index 69be628..b37f0f5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -14,7 +14,7 @@ def test_two_tailed_p(): assert two_tailed_p(10, 10) > 0.01, "p-value should be close to 0.5 for chi2 near mode" assert two_tailed_p(0, 10) < 0.01 - assert two_tailed_p(1e10, 5) < 0.01 + assert two_tailed_p(1e-10, 100) < 0.01 with pytest.raises(AssertionError): two_tailed_p(4, 2) From 9b4fce3830bef21a7f8c2c3726b02cae7eede857 Mon Sep 17 00:00:00 2001 From: Connor Stone Date: Fri, 6 Jun 2025 19:47:30 -0400 Subject: [PATCH 4/5] more coverage --- src/pted/utils.py | 2 -- tests/test_utils.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/pted/utils.py b/src/pted/utils.py index b8b60e0..f98df07 100644 --- a/src/pted/utils.py +++ b/src/pted/utils.py @@ -203,8 +203,6 @@ def root_eq(x): res_right = root_scalar( lambda x: root_eq(x), bracket=[mode, 1000 * df], method="brentq" ) - if not res_right.converged: - raise ValueError right = chi2_dist.sf(res_right.root, df) except ValueError: right = 0.0 # Assume negligible diff --git a/tests/test_utils.py b/tests/test_utils.py index b37f0f5..378648a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -14,7 +14,7 @@ def test_two_tailed_p(): assert two_tailed_p(10, 10) > 0.01, "p-value should be close to 0.5 for chi2 near mode" assert two_tailed_p(0, 10) < 0.01 - assert two_tailed_p(1e-10, 100) < 0.01 + assert two_tailed_p(1e-25, 1000) < 0.01 with pytest.raises(AssertionError): two_tailed_p(4, 2) From ca943b930c55c083808c8c6ab73ef1824bd08a91 Mon Sep 17 00:00:00 2001 From: Connor Stone Date: Fri, 6 Jun 2025 19:56:54 -0400 Subject: [PATCH 5/5] trusting root finder now --- src/pted/utils.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/pted/utils.py b/src/pted/utils.py index f98df07..74b848a 100644 --- a/src/pted/utils.py +++ b/src/pted/utils.py @@ -199,12 +199,7 @@ def root_eq(x): if chi2 > mode: right = chi2_dist.sf(chi2, df) else: - try: - res_right = root_scalar( - lambda x: root_eq(x), bracket=[mode, 1000 * df], method="brentq" - ) - right = chi2_dist.sf(res_right.root, df) - except ValueError: - right = 0.0 # Assume negligible + res_right = root_scalar(root_eq, bracket=[mode, 10000 * df], method="brentq") + right = chi2_dist.sf(res_right.root, df) return left + right