From 221dbfbf6d47724ce7cf48b56d732c432e4d7faa Mon Sep 17 00:00:00 2001
From: Connor Stone <connorstone628@gmail.com>
Date: Fri, 6 Jun 2025 18:14:04 -0400
Subject: [PATCH 1/5] change coverage test to two tail p-value

---
 README.md         | 24 ++++++++++++++++++++++--
 src/pted/pted.py  |  8 +++++---
 src/pted/tests.py |  2 +-
 src/pted/utils.py | 37 +++++++++++++++++++++++++++++++++++++
 4 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 1c26424..253b47d 100644
--- a/README.md
+++ b/README.md
@@ -62,6 +62,8 @@ print(f"p-value: {p_value:.3f}") # expect uniform random from 0-1
 
 ## How it works
 
+### Two sample test
+
 PTED uses the energy distance of the two samples `x` and `y`, this is computed as:
 
 $$d = \frac{2}{n_xn_y}\sum_{i,j}||x_i - y_j|| - \frac{1}{n_x^2}\sum_{i,j}||x_i - x_j|| - \frac{1}{n_y^2}\sum_{i,j}||y_i - y_j||$$
@@ -105,6 +107,21 @@ grey distribution is just a histogram of many re-runs of this procedure. We
 compute a p-value by taking the fraction of the energy distances that are
 greater than the current one.
 
+### Coverage test
+
+In the coverage test we have some number of simulations `nsim` where there is a
+true value `g` and some posterior samples `s`. For each simulation separately we
+use PTED to compute a p-value, essentially asking the question "was `g` drawn
+from the distribution that generated `s`?". Individually, these tests are not
+especially informative, however their p-values must have been drawn from
+`U(0,1)` under the null-hypothesis. Thus we just need a way to combine their
+statistical power. It turns out that for some `p ~ U(0,1)` value, we have that
+`- 2 ln(p)` is chi2 distributed with `dof = 2`. This means that we can sum the
+chi2 values for the PTED test on each simulation and compare with a chi2
+distribution with `dof = 2 * nsim`. We use a density based two tailed p-value
+test on this chi2 distribution meaning that if your posterior is underconfident
+or overconfident, you will get a small p-value that can be used to reject the
+null.
 
 ## GPU Compatibility
 
@@ -113,7 +130,9 @@ PyTorch Tensors on the appropriate device.
 
 ## Citation
 
-If you use PTED in your work, please include a citation to the [zenodo record](https://doi.org/10.5281/zenodo.15353928) and also see below for references of the underlying method.
+If you use PTED in your work, please include a citation to the [zenodo
+record](https://doi.org/10.5281/zenodo.15353928) and also see below for
+references of the underlying method.
 
 ## Reference
 
@@ -132,7 +151,8 @@ I didn't invent this test, I just think its neat. Here is a paper on the subject
 }
 ```
 
-Permutation tests are a whole class of tests, with much literature. Here are some starting points:
+Permutation tests are a whole class of tests, with much literature. Here are
+some starting points:
 
 ```
 @book{good2013permutation,
diff --git a/src/pted/pted.py b/src/pted/pted.py
index 16d05eb..c8c26bf 100644
--- a/src/pted/pted.py
+++ b/src/pted/pted.py
@@ -3,7 +3,7 @@
 from scipy.stats import chi2 as chi2_dist
 from torch import Tensor
 
-from .utils import _pted_torch, _pted_numpy, _pted_chunk_torch, _pted_chunk_numpy
+from .utils import _pted_torch, _pted_numpy, _pted_chunk_torch, _pted_chunk_numpy, two_tailed_p
 
 __all__ = ["pted", "pted_coverage_test"]
 
@@ -262,7 +262,9 @@ def pted_coverage_test(
         return test_stats, permute_stats
 
     # Compute p-values
+    if nsim == 1:
+        return np.mean(permute_stats > test_stats[0])
     pvals = np.mean(permute_stats > test_stats[:, None], axis=1)
     pvals[pvals == 0] = 1.0 / permutations  # handle pvals == 0
-    chi2 = -2 * np.log(pvals)
-    return 1 - chi2_dist.cdf(np.sum(chi2), 2 * nsim)
+    chi2 = np.sum(-2 * np.log(pvals))
+    return two_tailed_p(chi2, 2 * nsim)
diff --git a/src/pted/tests.py b/src/pted/tests.py
index 563c24e..5adeeef 100644
--- a/src/pted/tests.py
+++ b/src/pted/tests.py
@@ -49,6 +49,6 @@ def test():
     assert p < 1e-4, f"p-value {p} is not in the expected range (~0)"
     # underconfident
     p = pted_coverage_test(g, s_under, permutations=200)
-    assert p > 0.9999, f"p-value {p} is not in the expected range (~1)"
+    assert p < 1e-4, f"p-value {p} is not in the expected range (~0)"
 
     print("Tests passed!")
diff --git a/src/pted/utils.py b/src/pted/utils.py
index 7a31f91..65c855a 100644
--- a/src/pted/utils.py
+++ b/src/pted/utils.py
@@ -3,6 +3,9 @@
 import numpy as np
 from scipy.spatial.distance import cdist
 import torch
+from scipy.stats import chi2 as chi2_dist
+from scipy.optimize import root_scalar
+
 
 __all__ = ["_pted_numpy", "_pted_chunk_numpy", "_pted_torch", "_pted_chunk_torch"]
 
@@ -172,3 +175,37 @@ def _pted_torch(
         dmatrix = dmatrix[I][:, I]
         permute_stats.append(_energy_distance_precompute(dmatrix, nx, ny).item())
     return test_stat, permute_stats
+
+
+def two_tailed_p(chi2, df):
+    assert df > 2, "Degrees of freedom must be greater than 2 for two-tailed p-value calculation."
+    alpha = chi2_dist.pdf(chi2, df)
+    mode = df - 2
+
+    if chi2 == mode:
+        return 1.0
+
+    def root_eq(x):
+        return chi2_dist.pdf(x, df) - alpha
+
+    # Find left root
+    if chi2 < mode:
+        left = chi2_dist.cdf(chi2, df)
+    else:
+        try:
+            res_left = root_scalar(root_eq, bracket=[0, mode], method="brentq")
+            left = chi2_dist.cdf(res_left.root, df)
+        except ValueError:
+            left = 0.0  # Assume negligible
+
+    # Find right root
+    if chi2 > mode:
+        right = chi2_dist.sf(chi2, df)
+    else:
+        try:
+            res_right = root_scalar(root_eq, bracket=[mode, 100 * df], method="brentq")
+            right = chi2_dist.sf(res_right.root, df)
+        except ValueError:
+            right = 0.0  # Assume negligible
+
+    return left + right

From ec32caa97598a56aa7faa1d3d291369f91f8818a Mon Sep 17 00:00:00 2001
From: Connor Stone <connorstone628@gmail.com>
Date: Fri, 6 Jun 2025 19:37:54 -0400
Subject: [PATCH 2/5] better coverage and tests

---
 src/pted/pted.py    |  1 +
 src/pted/utils.py   |  8 ++++++--
 tests/test_pted.py  |  8 ++++++++
 tests/test_utils.py | 20 ++++++++++++++++++++
 4 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_utils.py

diff --git a/src/pted/pted.py b/src/pted/pted.py
index c8c26bf..8a3cd89 100644
--- a/src/pted/pted.py
+++ b/src/pted/pted.py
@@ -234,6 +234,7 @@ def pted_coverage_test(
         of permutations. For chunking to be worth it you should have c^2 * I << n^2.
     """
     nsamp, nsim, *_ = s.shape
+    assert nsim > 0, "need some simulations to run test, got 0 simulations"
     assert (
         g.shape == s.shape[1:]
     ), f"g and s must have the same shape (past first dim of s), not {g.shape} and {s.shape}"
diff --git a/src/pted/utils.py b/src/pted/utils.py
index 65c855a..0513097 100644
--- a/src/pted/utils.py
+++ b/src/pted/utils.py
@@ -182,7 +182,7 @@ def two_tailed_p(chi2, df):
     alpha = chi2_dist.pdf(chi2, df)
     mode = df - 2
 
-    if chi2 == mode:
+    if np.isclose(chi2, mode):
         return 1.0
 
     def root_eq(x):
@@ -203,7 +203,11 @@ def root_eq(x):
         right = chi2_dist.sf(chi2, df)
     else:
         try:
-            res_right = root_scalar(root_eq, bracket=[mode, 100 * df], method="brentq")
+            res_right = root_scalar(
+                lambda x: root_eq(x), bracket=[mode, 1000 * df], method="brentq"
+            )
+            if not res_right.converged:
+                raise ValueError
             right = chi2_dist.sf(res_right.root, df)
         except ValueError:
             right = 0.0  # Assume negligible
diff --git a/tests/test_pted.py b/tests/test_pted.py
index 01ac54d..8ac8bf8 100644
--- a/tests/test_pted.py
+++ b/tests/test_pted.py
@@ -87,3 +87,11 @@ def test_pted_chunk_numpy():
     y = np.random.uniform(size=(1000, D))
     p = pted.pted(x, y, chunk_size=100, chunk_iter=10)
     assert p < 1e-4, f"p-value {p} is not in the expected range (~0)"
+
+
+def test_pted_coverage_edgecase():
+    # Test with single simulation
+    g = np.random.normal(size=(1, 10))
+    s = np.random.normal(size=(100, 1, 10))
+    p = pted.pted_coverage_test(g, s)
+    assert p > 1e-4 and p < 0.9999, f"p-value {p} is not in the expected range (~1)"
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..69be628
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,20 @@
+import numpy as np
+
+from pted.utils import two_tailed_p
+
+import pytest
+
+
+def test_two_tailed_p():
+
+    assert np.isclose(two_tailed_p(4, 6), 1.0), "p-value at mode should be 1.0"
+
+    assert two_tailed_p(0.01, 10) < 0.01, "p-value should be less than 0.01 for small chi2"
+    assert two_tailed_p(100, 10) < 0.01, "p-value should be less than 0.01 for large chi2"
+    assert two_tailed_p(10, 10) > 0.01, "p-value should be close to 0.5 for chi2 near mode"
+
+    assert two_tailed_p(0, 10) < 0.01
+    assert two_tailed_p(1e10, 5) < 0.01
+
+    with pytest.raises(AssertionError):
+        two_tailed_p(4, 2)

From b13d4552deca4a66377d9aab131a71e6478802d1 Mon Sep 17 00:00:00 2001
From: Connor Stone <connorstone628@gmail.com>
Date: Fri, 6 Jun 2025 19:43:31 -0400
Subject: [PATCH 3/5] attempt better coverage

---
 src/pted/utils.py   | 7 ++-----
 tests/test_utils.py | 2 +-
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/pted/utils.py b/src/pted/utils.py
index 0513097..b8b60e0 100644
--- a/src/pted/utils.py
+++ b/src/pted/utils.py
@@ -192,11 +192,8 @@ def root_eq(x):
     if chi2 < mode:
         left = chi2_dist.cdf(chi2, df)
     else:
-        try:
-            res_left = root_scalar(root_eq, bracket=[0, mode], method="brentq")
-            left = chi2_dist.cdf(res_left.root, df)
-        except ValueError:
-            left = 0.0  # Assume negligible
+        res_left = root_scalar(root_eq, bracket=[0, mode], method="brentq")
+        left = chi2_dist.cdf(res_left.root, df)
 
     # Find right root
     if chi2 > mode:
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 69be628..b37f0f5 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -14,7 +14,7 @@ def test_two_tailed_p():
     assert two_tailed_p(10, 10) > 0.01, "p-value should be close to 0.5 for chi2 near mode"
 
     assert two_tailed_p(0, 10) < 0.01
-    assert two_tailed_p(1e10, 5) < 0.01
+    assert two_tailed_p(1e-10, 100) < 0.01
 
     with pytest.raises(AssertionError):
         two_tailed_p(4, 2)

From 9b4fce3830bef21a7f8c2c3726b02cae7eede857 Mon Sep 17 00:00:00 2001
From: Connor Stone <connorstone628@gmail.com>
Date: Fri, 6 Jun 2025 19:47:30 -0400
Subject: [PATCH 4/5] more coverage

---
 src/pted/utils.py   | 2 --
 tests/test_utils.py | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/pted/utils.py b/src/pted/utils.py
index b8b60e0..f98df07 100644
--- a/src/pted/utils.py
+++ b/src/pted/utils.py
@@ -203,8 +203,6 @@ def root_eq(x):
             res_right = root_scalar(
                 lambda x: root_eq(x), bracket=[mode, 1000 * df], method="brentq"
             )
-            if not res_right.converged:
-                raise ValueError
             right = chi2_dist.sf(res_right.root, df)
         except ValueError:
             right = 0.0  # Assume negligible
diff --git a/tests/test_utils.py b/tests/test_utils.py
index b37f0f5..378648a 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -14,7 +14,7 @@ def test_two_tailed_p():
     assert two_tailed_p(10, 10) > 0.01, "p-value should be close to 0.5 for chi2 near mode"
 
     assert two_tailed_p(0, 10) < 0.01
-    assert two_tailed_p(1e-10, 100) < 0.01
+    assert two_tailed_p(1e-25, 1000) < 0.01
 
     with pytest.raises(AssertionError):
         two_tailed_p(4, 2)

From ca943b930c55c083808c8c6ab73ef1824bd08a91 Mon Sep 17 00:00:00 2001
From: Connor Stone <connorstone628@gmail.com>
Date: Fri, 6 Jun 2025 19:56:54 -0400
Subject: [PATCH 5/5] trusting root finder now

---
 src/pted/utils.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/pted/utils.py b/src/pted/utils.py
index f98df07..74b848a 100644
--- a/src/pted/utils.py
+++ b/src/pted/utils.py
@@ -199,12 +199,7 @@ def root_eq(x):
     if chi2 > mode:
         right = chi2_dist.sf(chi2, df)
     else:
-        try:
-            res_right = root_scalar(
-                lambda x: root_eq(x), bracket=[mode, 1000 * df], method="brentq"
-            )
-            right = chi2_dist.sf(res_right.root, df)
-        except ValueError:
-            right = 0.0  # Assume negligible
+        res_right = root_scalar(root_eq, bracket=[mode, 10000 * df], method="brentq")
+        right = chi2_dist.sf(res_right.root, df)
 
     return left + right