From 21f0aee9d765a026ffd685332f42212701d1ea90 Mon Sep 17 00:00:00 2001
From: Connor Stone
Date: Thu, 12 Jun 2025 21:20:15 -0400
Subject: [PATCH 1/2] check over/under confidence

---
 README.md         | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/pted/pted.py  | 12 +++++++++++-
 src/pted/utils.py | 20 +++++++++++++++++++-
 3 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 253b47d..d18010d 100644
--- a/README.md
+++ b/README.md
@@ -123,6 +123,96 @@ test on this chi2 distribution meaning that if your posterior is underconfident
 or overconfident, you will get a small p-value that can be used to reject the
 null.
 
+## Interpreting the results
+
+### Two-sample test
+
+This is a null hypothesis test, so we are specifically asking the question:
+"if `x` and `y` were drawn from the same distribution, how likely am I to have
+observed an energy distance as extreme as this?" This is fundamentally
+different from the question "how likely is it that `x` and `y` were drawn from
+the same distribution?" The latter is what we would really like to ask, but we
+are not aware of a meaningful way to answer it. It is also important to note
+that we are specifically looking at extreme energy distances, so we are not
+examining the probability densities directly. If there were a transformation
+between `x` and `y` to which the energy distance is insensitive, the two
+distributions could be arbitrarily different without PTED identifying it. For
+example, since the default energy distance is computed with the Euclidean
+distance, a single dimension whose values are orders of magnitude larger than
+the others can dominate, so that all other dimensions are effectively ignored
+even if they differ greatly. For this reason we suggest the `mahalanobis`
+metric if this is a potential issue in your data.
+
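+As a concrete illustration of the masking issue just described, here is a
+minimal sketch; the `pted(x, y, permutations=..., metric=...)` call is
+assumed to match the two-sample interface described above:
+
+```python
+import numpy as np
+from pted import pted
+
+rng = np.random.default_rng(0)
+x = rng.normal(size=(500, 3))
+y = rng.normal(size=(500, 3))
+x[:, 0] *= 1e6  # a shared dimension with a huge scale...
+y[:, 0] *= 1e6
+y[:, 2] += 5.0  # ...masks a genuine shift in another dimension
+
+# Euclidean distances are dominated by dimension 0, so the shift in
+# dimension 2 contributes almost nothing to the energy distance.
+p_euclidean = pted(x, y, permutations=1000, metric="euclidean")
+
+# Mahalanobis whitening rescales dimensions by the covariance, so the
+# shift in dimension 2 is no longer hidden.
+p_mahalanobis = pted(x, y, permutations=1000, metric="mahalanobis")
+```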
+
+### Coverage test
+
+For the coverage test we apply the PTED two-sample test to each simulation
+separately. We then combine the resulting p-values using Fisher's method:
+-2 * sum(log(p_i)) follows a chi-squared distribution with degrees of freedom
+equal to 2 times the number of simulations. Because the combined test is
+two-tailed, we can detect both underconfidence and overconfidence.
+Underconfidence is when the posterior is too broad: it covers the ground truth
+by spreading too thin, not fully exploiting the information in the
+prior/likelihood of the posterior sampling process. Sometimes this is
+acceptable; for example, with Approximate Bayesian Computation one expects the
+posterior to be at least slightly underconfident. Overconfidence is when the
+posterior is too narrow, so the ground truth appears as an extreme outlier
+from its perspective. This can occur in two main ways. One is poor
+exploration of the posterior; for example, reporting only the maximum
+a posteriori point is always overconfident since it is a delta function
+(unless you get lucky and land on the ground truth). The other is a biased
+posterior: it may be appropriately broad but sit in the wrong part of your
+parameter space. PTED cannot distinguish these modes of overconfidence, but
+knowing which one you face is already powerful. As such, by default the
+coverage test warns users which kind of failure mode they are in if the
+`warn_coverage` parameter is not `None`.
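+
+The sketch below shows the coverage warnings in action, assuming `s` stacks
+the posterior samples with shape `(samples, simulations, dimensions)` and `g`
+holds one ground truth point per simulation:
+
+```python
+import torch
+from pted import pted_coverage_test
+
+g = torch.randn(100, 3)      # one ground truth point per simulation
+s = torch.randn(50, 100, 3)  # posterior samples for each simulation
+
+# Shrinking the posterior makes it overconfident: the ground truth looks
+# like an extreme outlier, and an overconfidence warning is raised.
+p_over = pted_coverage_test(g, s * 0.5)
+
+# Inflating the posterior makes it underconfident and raises an
+# underconfidence warning instead.
+p_under = pted_coverage_test(g, s * 2)
+
+# Pass warn_confidence=None to silence the warnings.
+p_silent = pted_coverage_test(g, s * 0.5, warn_confidence=None)
+```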
+
 ## GPU Compatibility
 
 PTED works on both CPU and GPU. All that is needed is to pass the `x` and `y` as

diff --git a/src/pted/pted.py b/src/pted/pted.py
index 8a3cd89..c456039 100644
--- a/src/pted/pted.py
+++ b/src/pted/pted.py
@@ -3,7 +3,14 @@
 from scipy.stats import chi2 as chi2_dist
 from torch import Tensor
 
-from .utils import _pted_torch, _pted_numpy, _pted_chunk_torch, _pted_chunk_numpy, two_tailed_p
+from .utils import (
+    _pted_torch,
+    _pted_numpy,
+    _pted_chunk_torch,
+    _pted_chunk_numpy,
+    two_tailed_p,
+    confidence_alert,
+)
 
 __all__ = ["pted", "pted_coverage_test"]
 
@@ -147,6 +154,7 @@ def pted_coverage_test(
     s: Union[np.ndarray, Tensor],
     permutations: int = 1000,
     metric: str = "euclidean",
+    warn_confidence: Optional[float] = 1e-3,
     return_all: bool = False,
     chunk_size: Optional[int] = None,
     chunk_iter: Optional[int] = None,
@@ -268,4 +276,6 @@ def pted_coverage_test(
     pvals = np.mean(permute_stats > test_stats[:, None], axis=1)
     pvals[pvals == 0] = 1.0 / permutations  # handle pvals == 0
     chi2 = np.sum(-2 * np.log(pvals))
+    if warn_confidence is not None:
+        confidence_alert(chi2, 2 * nsim, warn_confidence)
     return two_tailed_p(chi2, 2 * nsim)

diff --git a/src/pted/utils.py b/src/pted/utils.py
index 74b848a..9777866 100644
--- a/src/pted/utils.py
+++ b/src/pted/utils.py
@@ -1,4 +1,5 @@
-from typing import Optional, Union
+from typing import Union
+from warnings import warn
 
 import numpy as np
 from scipy.spatial.distance import cdist
@@ -203,3 +204,20 @@ def root_eq(x):
     right = chi2_dist.sf(res_right.root, df)
 
     return left + right
+
+
+def confidence_alert(chi2, df, level):
+    """Warn when the chi^2 statistic lies in either extreme tail."""
+    left_tail = chi2_dist.cdf(chi2, df)
+    right_tail = chi2_dist.sf(chi2, df)
+
+    if left_tail < level:
+        warn(
+            f"Chi^2 of {chi2:.2e} for degrees of freedom {df} indicates underconfidence (left tail p-value {left_tail:.2e} < {level:.2e}).",
+            UserWarning,
+        )
+    elif right_tail < level:
+        warn(
+            f"Chi^2 of {chi2:.2e} for degrees of freedom {df} indicates overconfidence (right tail p-value {right_tail:.2e} < {level:.2e}).",
+            UserWarning,
+        )

From 4d55ce733789a5ed6c0c0025cb1cdbb977c54b24 Mon Sep 17 00:00:00 2001
From: Connor Stone
Date: Fri, 13 Jun 2025 11:05:46 -0400
Subject: [PATCH 2/2] add tests

---
 README.md          |  2 +-
 src/pted/tests.py  |  4 ++--
 src/pted/utils.py  | 18 ++++++++++++++----
 tests/test_pted.py | 11 +++++++++++
 4 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index d18010d..1cf8f06 100644
--- a/README.md
+++ b/README.md
@@ -188,7 +188,7 @@ posterior: it may be appropriately broad but sit in the wrong part of your
 parameter space. PTED cannot distinguish these modes of overconfidence, but
 knowing which one you face is already powerful. As such, by default the
 coverage test warns users which kind of failure mode they are in if the
-`warn_coverage` parameter is not `None`.
+`warn_confidence` parameter is not `None`.
 
 The sketch below shows the coverage warnings in action, assuming `s` stacks
 the posterior samples with shape `(samples, simulations, dimensions)` and `g`

diff --git a/src/pted/tests.py b/src/pted/tests.py
index 5adeeef..6b59c24 100644
--- a/src/pted/tests.py
+++ b/src/pted/tests.py
@@ -45,10 +45,10 @@ def test():
     p = pted_coverage_test(g, s_corr, permutations=200)
     assert p > 1e-4 and p < 0.9999, f"p-value {p} is not in the expected range (U(0,1))"
     # overconfident
-    p = pted_coverage_test(g, s_over, permutations=200)
+    p = pted_coverage_test(g, s_over, permutations=200, warn_confidence=None)
     assert p < 1e-4, f"p-value {p} is not in the expected range (~0)"
     # underconfident
-    p = pted_coverage_test(g, s_under, permutations=200)
+    p = pted_coverage_test(g, s_under, permutations=200, warn_confidence=None)
     assert p < 1e-4, f"p-value {p} is not in the expected range (~0)"
 
     print("Tests passed!")

diff --git a/src/pted/utils.py b/src/pted/utils.py
index 9777866..8fe9155 100644
--- a/src/pted/utils.py
+++ b/src/pted/utils.py
@@ -206,6 +206,14 @@ def root_eq(x):
     return left + right
 
 
+class OverconfidenceWarning(UserWarning):
+    """Warning for overconfidence in chi-squared test results."""
+
+
+class UnderconfidenceWarning(UserWarning):
+    """Warning for underconfidence in chi-squared test results."""
+
+
 def confidence_alert(chi2, df, level):
     """Warn when the chi^2 statistic lies in either extreme tail."""
     left_tail = chi2_dist.cdf(chi2, df)
@@ -213,11 +221,13 @@ def confidence_alert(chi2, df, level):
 
     if left_tail < level:
         warn(
-            f"Chi^2 of {chi2:.2e} for degrees of freedom {df} indicates underconfidence (left tail p-value {left_tail:.2e} < {level:.2e}).",
-            UserWarning,
+            UnderconfidenceWarning(
+                f"Chi^2 of {chi2:.2e} for degrees of freedom {df} indicates underconfidence (left tail p-value {left_tail:.2e} < {level:.2e})."
+            )
         )
     elif right_tail < level:
         warn(
-            f"Chi^2 of {chi2:.2e} for degrees of freedom {df} indicates overconfidence (right tail p-value {right_tail:.2e} < {level:.2e}).",
-            UserWarning,
+            OverconfidenceWarning(
+                f"Chi^2 of {chi2:.2e} for degrees of freedom {df} indicates overconfidence (right tail p-value {right_tail:.2e} < {level:.2e})."
+            )
         )

diff --git a/tests/test_pted.py b/tests/test_pted.py
index 8ac8bf8..fb6e4ca 100644
--- a/tests/test_pted.py
+++ b/tests/test_pted.py
@@ -2,6 +2,8 @@
 import torch
 import numpy as np
 
+import pytest
+
 
 def test_inputs_extra_dims():
     np.random.seed(42)
@@ -95,3 +97,12 @@ def test_pted_coverage_edgecase():
     s = np.random.normal(size=(100, 1, 10))
     p = pted.pted_coverage_test(g, s)
     assert p > 1e-4 and p < 0.9999, f"p-value {p} is not in the expected range (~1)"
+
+
+def test_pted_coverage_overunder():
+    g = torch.randn(100, 3)
+    s = torch.randn(50, 100, 3)
+    with pytest.warns(pted.utils.OverconfidenceWarning):
+        p = pted.pted_coverage_test(g, s * 0.5)
+    with pytest.warns(pted.utils.UnderconfidenceWarning):
+        p = pted.pted_coverage_test(g, s * 2)