diff --git a/README.md b/README.md
index 253b47d..1cf8f06 100644
--- a/README.md
+++ b/README.md
@@ -123,6 +123,47 @@
 test on this chi2 distribution meaning that if your posterior is underconfident
 or overconfident, you will get a small p-value that can be used to reject the
 null.
 
+## Interpreting the results
+
+### Two sample test
+
+This is a null hypothesis test, so we are specifically asking the question:
+"if `x` and `y` were drawn from the same distribution, how likely am I to have
+observed an energy distance as extreme as this?" This is fundamentally different
+from the question "how likely is it that `x` and `y` were drawn from the same
+distribution?", which is really what we would like to ask, but I am unaware of
+a meaningful way to do that. It is also important to note that we are
+specifically looking at extreme energy distances, so we are not even directly
+comparing the probability densities. If there were a transformation between
+`x` and `y` that the energy distance is insensitive to, the two distributions
+could be arbitrarily different without PTED identifying it. For example, since
+the default energy distance is computed with the Euclidean distance, a single
+dimension whose values are orders of magnitude larger than the others can
+dominate the distance, so the remaining dimensions are effectively ignored even
+if they differ substantially between `x` and `y`. For this reason we suggest
+using the `mahalanobis` metric if this is a potential issue in your data.
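+
+As a rough sketch of that failure mode (assuming here that the two sample test
+is called as `pted(x, y, ...)` and accepts the same `permutations` and `metric`
+keywords as the coverage test):
+
+```python
+import numpy as np
+
+from pted import pted
+
+rng = np.random.default_rng(0)
+x = rng.normal(size=(500, 3))
+y = rng.normal(size=(500, 3))
+y[:, 0] += 1.0   # real difference in the first dimension
+x[:, 2] *= 1e4   # shared dimension with a huge scale in both samples
+y[:, 2] *= 1e4
+
+# The Euclidean energy distance is dominated by the large third dimension,
+# so the shift in the first dimension may go undetected (large p-value).
+p_euclidean = pted(x, y, permutations=1000, metric="euclidean")
+
+# The mahalanobis metric rescales the dimensions by their covariance, so the
+# shift should be flagged (small p-value).
+p_mahalanobis = pted(x, y, permutations=1000, metric="mahalanobis")
+```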
+
+### Coverage Test
+
+For the coverage test we apply the PTED two sample test to each simulation
+separately. We then combine the resulting p-values with Fisher's method, which
+yields a chi-squared statistic with 2 times the number of simulations as its
+degrees of freedom. Because of this, we can detect both underconfidence and
+overconfidence. Underconfidence is when the posterior distribution is too
+broad: it covers the ground truth by spreading too thin, not fully exploiting
+the information in the prior/likelihood of the posterior sampling process.
+Sometimes this is acceptable; for example, when using Approximate Bayesian
+Computation one expects the posterior to be at least slightly underconfident.
+Overconfidence is when the posterior is too narrow, so the ground truth appears
+as an extreme outlier from its perspective. This can occur in two main ways.
+One is poorly exploring the posterior; for example, simply reporting the
+maximum a-posteriori point is always overconfident since it is a delta function
+(unless you get lucky and land exactly on the ground truth). The other is a
+biased posterior: it may be appropriately broad, but it sits in the wrong part
+of your parameter space. PTED has no way to distinguish these modes of
+overconfidence; however, just knowing whether you are under- or over-confident
+can be powerful. By default the PTED coverage test will therefore warn users
+which failure mode they are in, unless the `warn_confidence` parameter is set
+to `None`.
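+
+For instance, a minimal sketch of the coverage test (shapes follow the package
+tests: `g` holds one ground truth point per simulation with shape
+`(n_sims, n_dims)`, and `s` holds posterior samples with shape
+`(n_samples, n_sims, n_dims)`):
+
+```python
+import torch
+
+from pted import pted_coverage_test
+
+g = torch.randn(100, 3)      # ground truth points, one per simulation
+s = torch.randn(50, 100, 3)  # posterior samples for each simulation
+
+# Well calibrated: p should look like a draw from U(0, 1).
+p = pted_coverage_test(g, s, permutations=200)
+
+# Posteriors that are too narrow should trigger an OverconfidenceWarning,
+# posteriors that are too broad an UnderconfidenceWarning.
+p_over = pted_coverage_test(g, s * 0.5, permutations=200)
+p_under = pted_coverage_test(g, s * 2, permutations=200)
+
+# Pass warn_confidence=None to disable the warnings entirely.
+p_quiet = pted_coverage_test(g, s * 0.5, permutations=200, warn_confidence=None)
+```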
+
 ## GPU Compatibility
 
 PTED works on both CPU and GPU. All that is needed is to pass the `x` and `y` as
diff --git a/src/pted/pted.py b/src/pted/pted.py
index 8a3cd89..c456039 100644
--- a/src/pted/pted.py
+++ b/src/pted/pted.py
@@ -3,7 +3,14 @@
 from scipy.stats import chi2 as chi2_dist
 from torch import Tensor
 
-from .utils import _pted_torch, _pted_numpy, _pted_chunk_torch, _pted_chunk_numpy, two_tailed_p
+from .utils import (
+    _pted_torch,
+    _pted_numpy,
+    _pted_chunk_torch,
+    _pted_chunk_numpy,
+    two_tailed_p,
+    confidence_alert,
+)
 
 __all__ = ["pted", "pted_coverage_test"]
 
@@ -147,6 +154,7 @@ def pted_coverage_test(
     s: Union[np.ndarray, Tensor],
     permutations: int = 1000,
     metric: str = "euclidean",
+    warn_confidence: Optional[float] = 1e-3,
     return_all: bool = False,
     chunk_size: Optional[int] = None,
     chunk_iter: Optional[int] = None,
@@ -268,4 +276,6 @@
     pvals = np.mean(permute_stats > test_stats[:, None], axis=1)
     pvals[pvals == 0] = 1.0 / permutations  # handle pvals == 0
     chi2 = np.sum(-2 * np.log(pvals))
+    if warn_confidence is not None:
+        confidence_alert(chi2, 2 * nsim, warn_confidence)
     return two_tailed_p(chi2, 2 * nsim)
diff --git a/src/pted/tests.py b/src/pted/tests.py
index 5adeeef..6b59c24 100644
--- a/src/pted/tests.py
+++ b/src/pted/tests.py
@@ -45,10 +45,10 @@ def test():
     p = pted_coverage_test(g, s_corr, permutations=200)
     assert p > 1e-4 and p < 0.9999, f"p-value {p} is not in the expected range (U(0,1))"
     # overconfident
-    p = pted_coverage_test(g, s_over, permutations=200)
+    p = pted_coverage_test(g, s_over, permutations=200, warn_confidence=None)
     assert p < 1e-4, f"p-value {p} is not in the expected range (~0)"
     # underconfident
-    p = pted_coverage_test(g, s_under, permutations=200)
+    p = pted_coverage_test(g, s_under, permutations=200, warn_confidence=None)
     assert p < 1e-4, f"p-value {p} is not in the expected range (~0)"
 
     print("Tests passed!")
diff --git a/src/pted/utils.py b/src/pted/utils.py
index 74b848a..8fe9155 100644
--- a/src/pted/utils.py
+++ b/src/pted/utils.py
@@ -1,4 +1,5 @@
-from typing import Optional, Union
+from typing import Union
+from warnings import warn
 
 import numpy as np
 from scipy.spatial.distance import cdist
@@ -203,3 +204,30 @@ def root_eq(x):
     right = chi2_dist.sf(res_right.root, df)
 
     return left + right
+
+
+class OverconfidenceWarning(UserWarning):
+    """Warning for overconfidence in chi-squared test results."""
+
+
+class UnderconfidenceWarning(UserWarning):
+    """Warning for underconfidence in chi-squared test results."""
+
+
+def confidence_alert(chi2, df, level):
+    """Warn if the chi-squared statistic falls in either tail at the given significance level."""
+    left_tail = chi2_dist.cdf(chi2, df)
+    right_tail = chi2_dist.sf(chi2, df)
+
+    if left_tail < level:
+        warn(
+            UnderconfidenceWarning(
+                f"Chi^2 of {chi2:.2e} for degrees of freedom {df} indicates underconfidence (left tail p-value {left_tail:.2e} < {level:.2e})."
+            )
+        )
+    elif right_tail < level:
+        warn(
+            OverconfidenceWarning(
+                f"Chi^2 of {chi2:.2e} for degrees of freedom {df} indicates overconfidence (right tail p-value {right_tail:.2e} < {level:.2e})."
+            )
+        )
diff --git a/tests/test_pted.py b/tests/test_pted.py
index 8ac8bf8..fb6e4ca 100644
--- a/tests/test_pted.py
+++ b/tests/test_pted.py
@@ -2,6 +2,8 @@
 import torch
 import numpy as np
 
+import pytest
+
 
 def test_inputs_extra_dims():
     np.random.seed(42)
@@ -95,3 +97,12 @@ def test_pted_coverage_edgecase():
     s = np.random.normal(size=(100, 1, 10))
     p = pted.pted_coverage_test(g, s)
     assert p > 1e-4 and p < 0.9999, f"p-value {p} is not in the expected range (~1)"
+
+
+def test_pted_coverage_overunder():
+    g = torch.randn(100, 3)
+    s = torch.randn(50, 100, 3)
+    with pytest.warns(pted.utils.OverconfidenceWarning):
+        p = pted.pted_coverage_test(g, s * 0.5)
+    with pytest.warns(pted.utils.UnderconfidenceWarning):
+        p = pted.pted_coverage_test(g, s * 2)