Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Bayes test #2258

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ require (
go4.org v0.0.0-20201209231011-d4a079459e60 // indirect
gocloud.dev v0.40.0 // indirect
golang.org/x/crypto v0.26.0 // indirect
golang.org/x/exp v0.0.0-20231006140011-7918f672742d // indirect
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa // indirect
golang.org/x/mod v0.18.0 // indirect
golang.org/x/net v0.28.0 // indirect
golang.org/x/sync v0.8.0 // indirect
Expand All @@ -172,6 +172,7 @@ require (
golang.org/x/tools v0.22.0 // indirect
golang.org/x/xerrors v0.0.0-20240716161551-93cc26a95ae9 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
gonum.org/v1/gonum v0.15.1 // indirect
google.golang.org/genproto v0.0.0-20240812133136-8ffd90a71988 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240812133136-8ffd90a71988 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240812133136-8ffd90a71988 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,8 @@ golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EH
golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU=
golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI=
golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo=
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ=
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa/go.mod h1:zk2irFbV9DP96SEBUUAy67IdHUaZuSnrz1n472HUCLE=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
Expand Down Expand Up @@ -1000,6 +1002,8 @@ gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw
gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
gonum.org/v1/gonum v0.12.0 h1:xKuo6hzt+gMav00meVPUlXwSdoEJP46BR+wdxQEFK2o=
gonum.org/v1/gonum v0.12.0/go.mod h1:73TDxJfAAHeA8Mk9mf8NlIppyhQNo5GLTcYeqgo2lvY=
gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0=
gonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o=
google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
Expand Down
94 changes: 94 additions & 0 deletions pkg/util/bayes/negative_binomial.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Notes:
# Rscript <filename> will run this. It only uses the standard library.

# To detect problems faster we can:
# * Use smaller priors so new data has a stronger influence.
# * Calculate posterior predictive distributions to detect deviations from what we expect.

# Conjugate Gamma-Poisson update: historical counts form the prior and the
# recent window's counts are folded in as additional pseudo-observations.
# Returns a list with the updated shape (alpha) and rate (beta) parameters.
posterior_params <- function(alpha_prior, beta_prior, k_recent, n_recent) {
  list(
    alpha = alpha_prior + k_recent,
    beta = beta_prior + n_recent
  )
}

# Posterior predictive probability of seeing exactly k failures in n trials.
# Under a Gamma(alpha, beta) posterior on the failure rate, the predictive
# distribution is negative binomial with size = alpha and
# prob = beta / (beta + n).
posterior_predictive <- function(alpha_posterior, beta_posterior, k, n) {
  success_prob <- beta_posterior / (beta_posterior + n)
  dnbinom(x = k, size = alpha_posterior, prob = success_prob)
}

# Run one end-to-end scenario: update the prior with the recent window, then
# report the posterior mean failure rate and the predictive probability of the
# PR's observed failure count.
analyze_scenario <- function(alpha_prior, beta_prior, k_recent, n_recent, k_pr, n_pr) {
  post <- posterior_params(alpha_prior, beta_prior, k_recent, n_recent)
  pr_prob <- posterior_predictive(post$alpha, post$beta, k_pr, n_pr)

  cat("--- Results ---\n")
  cat("Posterior Mean Failure Rate:", post$alpha / post$beta, "\n")
  cat("Posterior Predictive Probability of", k_pr, "failures in", n_pr, "tests:", pr_prob, "\n\n")
}

# Scenario table. Field meanings:
#   alpha_prior = historical failures     beta_prior = historical total tests
#   k_recent    = recent failures         n_recent   = recent total tests
#   k_pr        = pull request failures   n_pr       = pull request total tests
scenarios <- list(
  list(
    label = "Significant historical, limited mixed results in PR, possible on-going issue",
    alpha_prior = 10, beta_prior = 1000, k_recent = 7, n_recent = 27, k_pr = 1, n_pr = 2
  ),
  list(
    label = "Limited historical, limited mixed results in PR",
    alpha_prior = 1, beta_prior = 30, k_recent = 0, n_recent = 5, k_pr = 1, n_pr = 2
  ),
  list(
    label = "Limited historical, unlikely regression in PR",
    alpha_prior = 1, beta_prior = 30, k_recent = 0, n_recent = 20, k_pr = 1, n_pr = 10
  ),
  list(
    label = "Limited historical, obvious regression in PR",
    alpha_prior = 1, beta_prior = 30, k_recent = 0, n_recent = 20, k_pr = 10, n_pr = 15
  ),
  list(
    label = "Strong high pass rate historical data, but this test is failing outside our PR in recent runs",
    alpha_prior = 0, beta_prior = 1000, k_recent = 20, n_recent = 30, k_pr = 1, n_pr = 3
  ),
  # Real-world numbers taken from Sippy component readiness (see the original
  # PR for the full query URL):
  # https://sippy.dptools.openshift.org/sippy-ng/component_readiness/test_details
  list(
    label = "Slight Regression found from CR recently and then 3 failures out of 10 in a PR.",
    alpha_prior = 0, beta_prior = 418, k_recent = 8, n_recent = 106, k_pr = 3, n_pr = 10
  )
)

for (s in scenarios) {
  print(s$label)
  analyze_scenario(
    alpha_prior = s$alpha_prior,
    beta_prior = s$beta_prior,
    k_recent = s$k_recent,
    n_recent = s$n_recent,
    k_pr = s$k_pr,
    n_pr = s$n_pr
  )
}
80 changes: 80 additions & 0 deletions pkg/util/bayes/negative_binomial.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package main

import (
"fmt"
"math"
)

// posteriorParams folds the recent window's failure/trial counts into the
// Gamma prior and returns the updated (alpha, beta) pair.
func posteriorParams(alphaPrior, betaPrior, kRecent, nRecent float64) (float64, float64) {
	return alphaPrior + kRecent, betaPrior + nRecent
}

// factorial returns n! as a float64.
// Callers should be aware the result overflows to +Inf for n > 170 since the
// product is accumulated in floating point. Non-positive n yields 1.
func factorial(n int) float64 {
	product := 1.0
	for i := n; i > 1; i-- {
		product *= float64(i)
	}
	return product
}

// binomialCoefficient returns C(n, k), the number of ways to choose k items
// from n, as a float64.
//
// The previous implementation computed n!/(k!(n-k)!) directly, which
// overflows float64 (Inf/Inf = NaN) for n >= 171 even when the coefficient
// itself is small, e.g. C(200, 2). This version uses the multiplicative form
//   C(n, k) = prod_{i=1..k} (n-k+i)/i
// whose running prefix products are themselves binomial coefficients, so the
// result stays exact while it fits in a float64's integer range.
// Returns 0 for k < 0 or k > n.
func binomialCoefficient(n, k int) float64 {
	if k < 0 || k > n {
		return 0
	}
	// C(n, k) == C(n, n-k); use the smaller side to shorten the loop.
	if k > n-k {
		k = n - k
	}
	result := 1.0
	for i := 1; i <= k; i++ {
		result = result * float64(n-k+i) / float64(i)
	}
	return result
}

// Negative Binomial probability calculation
func negativeBinomial(k int, size, prob float64) float64 {
chooseFactor := binomialCoefficient(k+int(size)-1, k)
return chooseFactor * math.Pow(prob, size) * math.Pow(1-prob, float64(k))
}

// posteriorPredictive returns the posterior predictive probability of k
// failures in n trials. Under a Gamma(alpha, beta) posterior on the failure
// rate the predictive distribution is negative binomial with
// prob = beta / (beta + n).
func posteriorPredictive(alphaPosterior, betaPosterior, k, n float64) float64 {
	successProb := betaPosterior / (betaPosterior + n)
	return negativeBinomial(int(k), alphaPosterior, successProb)
}

// analyzeScenario runs one end-to-end scenario: it updates the prior with the
// recent window, computes the predictive probability of the PR's observed
// failures, and prints both the posterior mean failure rate and that
// probability to stdout.
func analyzeScenario(alphaPrior, betaPrior, kRecent, nRecent, kPr, nPr float64) {
	// Fold the recent window into the prior.
	alphaPosterior, betaPosterior := posteriorParams(alphaPrior, betaPrior, kRecent, nRecent)

	// How likely are the PR's results under the updated model?
	predictive := posteriorPredictive(alphaPosterior, betaPosterior, kPr, nPr)

	// Report.
	fmt.Print("--- Results ---\n")
	fmt.Printf("Posterior Mean Failure Rate: %.4f\n", alphaPosterior/betaPosterior)
	fmt.Printf("Posterior Predictive Probability of %.0f failures in %.0f tests: %.6f\n\n", kPr, nPr, predictive)
}

// main walks a fixed table of scenarios, printing each label followed by the
// analysis for that scenario's historical / recent / PR counts.
func main() {
	scenarios := []struct {
		label                                             string
		alphaPrior, betaPrior, kRecent, nRecent, kPr, nPr float64
	}{
		{"Significant historical, limited mixed results in PR, possible ongoing issue", 10, 1000, 7, 27, 1, 2},
		{"Limited historical, limited mixed results in PR", 1, 30, 0, 5, 1, 2},
		{"Limited historical, unlikely regression in PR", 1, 30, 0, 20, 1, 10},
		{"Limited historical, obvious regression in PR", 1, 30, 0, 20, 10, 15},
		{"Strong high pass rate historical data, but this test is failing outside our PR in recent runs", 0, 1000, 20, 30, 1, 3},
		{"Slight Regression found from CR", 0, 418, 8, 106, 3, 10},
	}

	for _, s := range scenarios {
		fmt.Println(s.label)
		analyzeScenario(s.alphaPrior, s.betaPrior, s.kRecent, s.nRecent, s.kPr, s.nPr)
	}
}

135 changes: 135 additions & 0 deletions pkg/util/bayes/pr_bayes_statistics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
package bayes

import (
"testing"

log "github.com/sirupsen/logrus"
"gonum.org/v1/gonum/stat/distuv"
)

// BayesianSafetyCheck determines if a PR is safe to merge based on historical data,
// other PR results, and environment-specific issues.
//
// It builds a Beta posterior over the test's pass rate from three evidence
// sources — long-run historical counts, a recent window (weighted more
// heavily to catch ongoing incidents), and this PR's own results — then
// returns (probSafe, probRegression): the posterior probability that the pass
// rate has not / has dropped by more than thresholdDrop below the historical
// rate.
//
// Fixes over the original: with zero historical volume the threshold was
// NaN (0/0), and with zero total volume prWeight became +Inf; both are now
// guarded.
func BayesianSafetyCheck(
	historicalPasses, historicalFailures int, // simple historical data over the past x weeks
	recentPasses, recentFailures int, // recent data we'll weight more heavily to catch on-going incidents
	prPasses, prFailures int, // results from this PR
	thresholdDrop float64, // returns our confidence the test pass rate has dropped more than this amount
) (float64, float64) {

	// Laplace smoothing for historical data
	smoothing := 1.0
	alphaHistorical := float64(historicalPasses) + smoothing
	betaHistorical := float64(historicalFailures) + smoothing

	// Calculate recent-to-historical ratio
	recentVolume := float64(recentPasses + recentFailures)
	historicalVolume := float64(historicalPasses + historicalFailures)
	volumeScale := recentVolume / (historicalVolume + 1.0) // +1 avoids division by zero

	// Apply volume scaling to recent results, we want recent results to be considered far more significant
	// than historical.
	// Increase recent weight when failures are ongoing
	recentWeightBoost := 1.0 + (float64(recentFailures)/(float64(recentPasses)+1.0))*2.0
	dynamicWeight := (1.0 + volumeScale*2.0) * recentWeightBoost

	alphaOtherPr := float64(recentPasses)*dynamicWeight + smoothing
	betaOtherPr := float64(recentFailures)*dynamicWeight + smoothing

	// Combine priors
	alphaCombined := alphaHistorical + alphaOtherPr
	betaCombined := betaHistorical + betaOtherPr

	// New evidence weighting factor.
	// This is tricky, if we have say 1000 runs in the historical data, no PR can generate enough information
	// to possibly have the model think it could be a regression. We have to weight our PR samples to model
	// our intuition. Because it can depend on the amount of historical data, we do this dynamically based on
	// how much data we're up against.
	/*
		newEvidenceWeight := float64(historicalPasses+historicalFailures+recentPasses+recentFailures) / 20.0
		if newEvidenceWeight < 1.0 {
			newEvidenceWeight = 1.0 // Ensure a minimum weight
		}
	*/

	// Dynamically limit PR contribution based on historical and recent data volume.
	// Guard the zero-volume case, which previously produced an infinite weight.
	prWeight := 1.0 // Minimum weight for PR evidence
	if prWeightLimit := (recentVolume + historicalVolume) / 10.0; prWeightLimit > 0 {
		if w := float64(prPasses+prFailures) / prWeightLimit; w > prWeight {
			prWeight = w
		}
	}

	// Adjust combined prior with PR results
	alphaPosterior := alphaCombined + float64(prPasses)*prWeight
	betaPosterior := betaCombined + float64(prFailures)*prWeight

	log.Infof("alpha historical = %.1f, recent = %.1f, pr = %.1f",
		alphaHistorical,
		alphaOtherPr,
		alphaPosterior)
	log.Infof("beta historical = %.1f, recent = %.1f, pr = %.1f",
		betaHistorical,
		betaOtherPr,
		betaPosterior)

	// Define threshold for pass rate drop.
	// With no historical data at all the raw rate is 0/0 (NaN) and would
	// poison the CDF below, so fall back to the smoothed prior mean.
	historicalRate := alphaHistorical / (alphaHistorical + betaHistorical)
	if historicalPasses+historicalFailures > 0 {
		historicalRate = float64(historicalPasses) / float64(historicalPasses+historicalFailures)
	}
	threshold := historicalRate - thresholdDrop

	// Beta distribution for posterior
	betaDist := distuv.Beta{Alpha: alphaPosterior, Beta: betaPosterior}

	// Calculate probabilities: P(pass rate < threshold) is the regression mass.
	probRegression := betaDist.CDF(threshold)
	probSafe := 1.0 - probRegression

	log.Infof("Historical %d/%d, Recent Jobs %d/%d, This PR: %d/%d = Probability regression: %.3f, Probability safe: %.3f",
		historicalPasses, historicalFailures+historicalPasses,
		recentPasses, recentFailures+recentPasses,
		prPasses, prFailures+prPasses, probRegression, probSafe)

	return probSafe, probRegression
}

// Example usage: exercise BayesianSafetyCheck across a table of scenarios and
// log the resulting probabilities for manual inspection.
func Test_PRSafetyCheck(t *testing.T) {
	// passRateDrop is the drop in pass rate we're testing certainty for,
	// i.e. how certain are we the tests pass rate has dropped this percentage if we merge this PR
	const passRateDrop = 0.05

	scenarios := []struct {
		desc                   string
		histPasses, histFails  int
		recentPasses, recentFails int
		prPasses, prFails      int
	}{
		{"Significant historical, limited mixed results in PR, possible on-going issue",
			1000, 10, 20, 7, 1, 1},
		{"Limited historical, limited mixed results in PR",
			29, 1, 5, 0, 1, 1},
		{"Limited historical, unlikely regression in PR",
			29, 1, 20, 0, 9, 1},
		{"Limited historical, obvious regression in PR",
			29, 1, 20, 0, 5, 10},
		// Strong high pass rate historical data, but this test is failing
		// outside our PR in recent runs, be that other PRs or periodics:
		{"Stable test, on-going incident outside PR",
			1000, 0, 10, 20, 1, 2},
	}

	for _, s := range scenarios {
		log.Info(s.desc)
		BayesianSafetyCheck(
			s.histPasses, s.histFails,
			s.recentPasses, s.recentFails,
			s.prPasses, s.prFails,
			passRateDrop)
	}
}
Loading