41 changes: 41 additions & 0 deletions README.md
@@ -123,6 +123,47 @@ test on this chi2 distribution meaning that if your posterior is underconfident
or overconfident, you will get a small p-value that can be used to reject the
null.

## Interpreting the results

### Two sample test

This is a null hypothesis test, so we are specifically asking the question:
"if `x` and `y` were drawn from the same distribution, how likely am I to have
observed an energy distance as extreme as this?" This is fundamentally different
from the question "how likely is it that `x` and `y` were drawn from the same
distribution?", which is really what we would like to ask, but I am unaware of
a meaningful way to do that. It is also important to note that the test looks
specifically at extreme energy distances, so it does not compare the
probability densities directly. If there were a difference between `x` and `y`
to which the energy distance is insensitive, then the two distributions could
be arbitrarily different without PTED identifying it. For example, since the
default energy distance is computed with the Euclidean metric, a single
dimension whose values are orders of magnitude larger than the others can
dominate the distance, so differences in all other dimensions are effectively
ignored. For this reason we suggest the `mahalanobis` metric if this is a
potential issue in your data, as in the sketch below.
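
As a hypothetical sketch of this failure mode (the two-sample call
`pted(x, y, permutations=..., metric=...)` returning a p-value is assumed
here, and the data are made up):

```python
import numpy as np

from pted import pted

rng = np.random.default_rng(0)
x = rng.normal(size=(500, 3))
y = rng.normal(size=(500, 3))
y[:, 1] += 1.0   # a genuine difference between x and y in dimension 1...
x[:, 0] *= 1e6   # ...masked by a dimension with a vastly larger scale
y[:, 0] *= 1e6

# Euclidean distances are dominated by dimension 0, likely hiding the shift;
# mahalanobis rescales each dimension by the data covariance first.
print(pted(x, y, permutations=500, metric="euclidean"))
print(pted(x, y, permutations=500, metric="mahalanobis"))
```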

### Coverage Test

For the coverage test we apply the PTED two sample test to each simulation
separately. We then combine the resulting p-values via Fisher's method: the
statistic `-2 * sum(log(p))` follows a chi-squared distribution whose degrees
of freedom equal 2 times the number of simulations. Because the combined
statistic can land in either tail, we can detect both underconfidence and
overconfidence.

Underconfidence is when the posterior distribution is too broad: it covers the
ground truth by spreading too thin, not fully exploiting the information in the
prior/likelihood of the posterior sampling process. Sometimes this is
acceptable; for example, when using Approximate Bayesian Computation one
expects the posterior to be at least slightly underconfident.

Overconfidence is when the posterior is too narrow, so the ground truth appears
as an extreme outlier from its perspective. This can occur in two main ways.
One is poor exploration of the posterior; for example, reporting only the
maximum a posteriori point is always overconfident, since a point estimate is a
delta function (unless you get lucky and land exactly on the ground truth). The
other is bias: the posterior may be appropriately broad, but sit in the wrong
part of parameter space. PTED has no way to distinguish these modes of
overconfidence; however, just knowing whether you are under- or over-confident
can be powerful. As such, by default the PTED coverage test warns users which
failure mode they are in; set the `warn_confidence` parameter to `None` to
disable this. A sketch of the combination step is given below.
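
A minimal sketch of this combination step (illustrative, not the library's
exact code; the `combine_coverage_pvalues` helper is hypothetical, and the
library's `two_tailed_p` computes the two-sided p-value more carefully):

```python
import numpy as np
from scipy.stats import chi2 as chi2_dist

def combine_coverage_pvalues(pvals, level=1e-3):
    """Fisher's method over per-simulation p-values, with confidence alerts."""
    df = 2 * len(pvals)                # 2 degrees of freedom per simulation
    chi2 = np.sum(-2 * np.log(pvals))  # ~ chi2(df) under the null
    left = chi2_dist.cdf(chi2, df)     # small when p-values cluster near 1 (underconfident)
    right = chi2_dist.sf(chi2, df)     # small when p-values cluster near 0 (overconfident)
    if left < level:
        print("warning: posterior looks underconfident")
    elif right < level:
        print("warning: posterior looks overconfident")
    return 2 * min(left, right)        # crude two-sided p-value
```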

## GPU Compatibility

PTED works on both CPU and GPU. All that is needed is to pass the `x` and `y` as
12 changes: 11 additions & 1 deletion src/pted/pted.py
Expand Up @@ -3,7 +3,14 @@
from scipy.stats import chi2 as chi2_dist
from torch import Tensor

from .utils import _pted_torch, _pted_numpy, _pted_chunk_torch, _pted_chunk_numpy, two_tailed_p
from .utils import (
    _pted_torch,
    _pted_numpy,
    _pted_chunk_torch,
    _pted_chunk_numpy,
    two_tailed_p,
    confidence_alert,
)

__all__ = ["pted", "pted_coverage_test"]

@@ -147,6 +154,7 @@ def pted_coverage_test(
    s: Union[np.ndarray, Tensor],
    permutations: int = 1000,
    metric: str = "euclidean",
    warn_confidence: Optional[float] = 1e-3,
    return_all: bool = False,
    chunk_size: Optional[int] = None,
    chunk_iter: Optional[int] = None,
@@ -268,4 +276,6 @@ def pted_coverage_test(
    pvals = np.mean(permute_stats > test_stats[:, None], axis=1)
    pvals[pvals == 0] = 1.0 / permutations  # handle pvals == 0
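    # Fisher's method: under the null, this sum follows a chi^2 with 2 * nsim dof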
    chi2 = np.sum(-2 * np.log(pvals))
    if warn_confidence is not None:
        confidence_alert(chi2, 2 * nsim, warn_confidence)
    return two_tailed_p(chi2, 2 * nsim)
4 changes: 2 additions & 2 deletions src/pted/tests.py
@@ -45,10 +45,10 @@ def test():
    p = pted_coverage_test(g, s_corr, permutations=200)
    assert p > 1e-4 and p < 0.9999, f"p-value {p} is not in the expected range (U(0,1))"
    # overconfident
    p = pted_coverage_test(g, s_over, permutations=200)
    p = pted_coverage_test(g, s_over, permutations=200, warn_confidence=None)
    assert p < 1e-4, f"p-value {p} is not in the expected range (~0)"
    # underconfident
    p = pted_coverage_test(g, s_under, permutations=200)
    p = pted_coverage_test(g, s_under, permutations=200, warn_confidence=None)
    assert p < 1e-4, f"p-value {p} is not in the expected range (~0)"

    print("Tests passed!")
30 changes: 29 additions & 1 deletion src/pted/utils.py
@@ -1,4 +1,5 @@
from typing import Optional, Union
from typing import Union
from warnings import warn

import numpy as np
from scipy.spatial.distance import cdist
@@ -203,3 +204,30 @@ def root_eq(x):
    right = chi2_dist.sf(res_right.root, df)

    return left + right


class OverconfidenceWarning(UserWarning):
    """Warning for overconfidence in chi-squared test results."""


class UnderconfidenceWarning(UserWarning):
    """Warning for underconfidence in chi-squared test results."""


def confidence_alert(chi2, df, level):

    left_tail = chi2_dist.cdf(chi2, df)
    right_tail = chi2_dist.sf(chi2, df)

    if left_tail < level:
        warn(
            UnderconfidenceWarning(
                f"Chi^2 of {chi2:.2e} for degrees of freedom {df} indicates underconfidence (left tail p-value {left_tail:.2e} < {level:.2e})."
            )
        )
    elif right_tail < level:
        warn(
            OverconfidenceWarning(
                f"Chi^2 of {chi2:.2e} for degrees of freedom {df} indicates overconfidence (right tail p-value {right_tail:.2e} < {level:.2e})."
            )
        )
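
For illustration, `confidence_alert` behaves as follows (the chi-squared
values here are made up; under the null the statistic is close to `df`):

```python
confidence_alert(chi2=10.0, df=100, level=1e-3)   # far below df -> UnderconfidenceWarning
confidence_alert(chi2=300.0, df=100, level=1e-3)  # far above df -> OverconfidenceWarning
confidence_alert(chi2=100.0, df=100, level=1e-3)  # consistent with the null -> silent
```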
11 changes: 11 additions & 0 deletions tests/test_pted.py
@@ -2,6 +2,8 @@
import torch
import numpy as np

import pytest


def test_inputs_extra_dims():
    np.random.seed(42)
@@ -95,3 +97,12 @@ def test_pted_coverage_edgecase():
    s = np.random.normal(size=(100, 1, 10))
    p = pted.pted_coverage_test(g, s)
    assert p > 1e-4 and p < 0.9999, f"p-value {p} is not in the expected range (~1)"


def test_pted_coverage_overunder():
    g = torch.randn(100, 3)
    s = torch.randn(50, 100, 3)
    with pytest.warns(pted.utils.OverconfidenceWarning):
        p = pted.pted_coverage_test(g, s * 0.5)
    with pytest.warns(pted.utils.UnderconfidenceWarning):
        p = pted.pted_coverage_test(g, s * 2)