Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Bayes test #2258

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ require (
go4.org v0.0.0-20201209231011-d4a079459e60 // indirect
gocloud.dev v0.40.0 // indirect
golang.org/x/crypto v0.26.0 // indirect
golang.org/x/exp v0.0.0-20231006140011-7918f672742d // indirect
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa // indirect
golang.org/x/mod v0.18.0 // indirect
golang.org/x/net v0.28.0 // indirect
golang.org/x/sync v0.8.0 // indirect
Expand All @@ -172,6 +172,7 @@ require (
golang.org/x/tools v0.22.0 // indirect
golang.org/x/xerrors v0.0.0-20240716161551-93cc26a95ae9 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
gonum.org/v1/gonum v0.15.1 // indirect
google.golang.org/genproto v0.0.0-20240812133136-8ffd90a71988 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240812133136-8ffd90a71988 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240812133136-8ffd90a71988 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,8 @@ golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EH
golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU=
golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI=
golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo=
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ=
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa/go.mod h1:zk2irFbV9DP96SEBUUAy67IdHUaZuSnrz1n472HUCLE=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
Expand Down Expand Up @@ -1000,6 +1002,8 @@ gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw
gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
gonum.org/v1/gonum v0.12.0 h1:xKuo6hzt+gMav00meVPUlXwSdoEJP46BR+wdxQEFK2o=
gonum.org/v1/gonum v0.12.0/go.mod h1:73TDxJfAAHeA8Mk9mf8NlIppyhQNo5GLTcYeqgo2lvY=
gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0=
gonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o=
google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
Expand Down
94 changes: 94 additions & 0 deletions pkg/util/bayes/negative_binomial.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Notes:
# Rscript <filename> will run this. It only uses the standard library.

# To detect problems faster we can:
# * Use smaller priors so new data has a stronger influence.
# * Calculate posterior predictive distributions to detect deviations from what we expect.

# Conjugate Gamma-Poisson update: historical counts form the prior and the
# recent window's counts are folded in as additional pseudo-observations.
# Returns a list with the updated shape (alpha) and rate (beta) parameters.
posterior_params <- function(alpha_prior, beta_prior, k_recent, n_recent) {
  list(
    alpha = alpha_prior + k_recent,
    beta = beta_prior + n_recent
  )
}

# Posterior predictive probability of seeing exactly k failures in n trials.
# Under a Gamma(alpha, beta) posterior on the failure rate, the predictive
# distribution is negative binomial with size = alpha and
# prob = beta / (beta + n).
posterior_predictive <- function(alpha_posterior, beta_posterior, k, n) {
  success_prob <- beta_posterior / (beta_posterior + n)
  dnbinom(x = k, size = alpha_posterior, prob = success_prob)
}

# Run one end-to-end scenario: update the prior with the recent window, then
# report the posterior mean failure rate and the predictive probability of the
# PR's observed failure count.
analyze_scenario <- function(alpha_prior, beta_prior, k_recent, n_recent, k_pr, n_pr) {
  post <- posterior_params(alpha_prior, beta_prior, k_recent, n_recent)
  pr_prob <- posterior_predictive(post$alpha, post$beta, k_pr, n_pr)

  cat("--- Results ---\n")
  cat("Posterior Mean Failure Rate:", post$alpha / post$beta, "\n")
  cat("Posterior Predictive Probability of", k_pr, "failures in", n_pr, "tests:", pr_prob, "\n\n")
}

# Scenario table. Field meanings:
#   alpha_prior = historical failures     beta_prior = historical total tests
#   k_recent    = recent failures         n_recent   = recent total tests
#   k_pr        = pull request failures   n_pr       = pull request total tests
scenarios <- list(
  list(
    label = "Significant historical, limited mixed results in PR, possible on-going issue",
    alpha_prior = 10, beta_prior = 1000, k_recent = 7, n_recent = 27, k_pr = 1, n_pr = 2
  ),
  list(
    label = "Limited historical, limited mixed results in PR",
    alpha_prior = 1, beta_prior = 30, k_recent = 0, n_recent = 5, k_pr = 1, n_pr = 2
  ),
  list(
    label = "Limited historical, unlikely regression in PR",
    alpha_prior = 1, beta_prior = 30, k_recent = 0, n_recent = 20, k_pr = 1, n_pr = 10
  ),
  list(
    label = "Limited historical, obvious regression in PR",
    alpha_prior = 1, beta_prior = 30, k_recent = 0, n_recent = 20, k_pr = 10, n_pr = 15
  ),
  list(
    label = "Strong high pass rate historical data, but this test is failing outside our PR in recent runs",
    alpha_prior = 0, beta_prior = 1000, k_recent = 20, n_recent = 30, k_pr = 1, n_pr = 3
  ),
  # Real-world numbers taken from Sippy component readiness (see the original
  # PR for the full query URL):
  # https://sippy.dptools.openshift.org/sippy-ng/component_readiness/test_details
  list(
    label = "Slight Regression found from CR recently and then 3 failures out of 10 in a PR.",
    alpha_prior = 0, beta_prior = 418, k_recent = 8, n_recent = 106, k_pr = 3, n_pr = 10
  )
)

for (s in scenarios) {
  print(s$label)
  analyze_scenario(
    alpha_prior = s$alpha_prior,
    beta_prior = s$beta_prior,
    k_recent = s$k_recent,
    n_recent = s$n_recent,
    k_pr = s$k_pr,
    n_pr = s$n_pr
  )
}
80 changes: 80 additions & 0 deletions pkg/util/bayes/negative_binomial.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package main

import (
"fmt"
"math"
)

// posteriorParams folds the recent window's failure/trial counts into the
// Gamma prior and returns the updated (alpha, beta) pair.
func posteriorParams(alphaPrior, betaPrior, kRecent, nRecent float64) (float64, float64) {
	return alphaPrior + kRecent, betaPrior + nRecent
}

// factorial returns n! as a float64.
// Callers should be aware the result overflows to +Inf for n > 170 since the
// product is accumulated in floating point. Non-positive n yields 1.
func factorial(n int) float64 {
	product := 1.0
	for i := n; i > 1; i-- {
		product *= float64(i)
	}
	return product
}

// binomialCoefficient returns C(n, k), the number of ways to choose k items
// from n, as a float64.
//
// The previous implementation computed n!/(k!(n-k)!) directly, which
// overflows float64 (Inf/Inf = NaN) for n >= 171 even when the coefficient
// itself is small, e.g. C(200, 2). This version uses the multiplicative form
//   C(n, k) = prod_{i=1..k} (n-k+i)/i
// whose running prefix products are themselves binomial coefficients, so the
// result stays exact while it fits in a float64's integer range.
// Returns 0 for k < 0 or k > n.
func binomialCoefficient(n, k int) float64 {
	if k < 0 || k > n {
		return 0
	}
	// C(n, k) == C(n, n-k); use the smaller side to shorten the loop.
	if k > n-k {
		k = n - k
	}
	result := 1.0
	for i := 1; i <= k; i++ {
		result = result * float64(n-k+i) / float64(i)
	}
	return result
}

// Negative Binomial probability calculation
func negativeBinomial(k int, size, prob float64) float64 {
chooseFactor := binomialCoefficient(k+int(size)-1, k)
return chooseFactor * math.Pow(prob, size) * math.Pow(1-prob, float64(k))
}

// posteriorPredictive returns the posterior predictive probability of k
// failures in n trials. Under a Gamma(alpha, beta) posterior on the failure
// rate the predictive distribution is negative binomial with
// prob = beta / (beta + n).
func posteriorPredictive(alphaPosterior, betaPosterior, k, n float64) float64 {
	successProb := betaPosterior / (betaPosterior + n)
	return negativeBinomial(int(k), alphaPosterior, successProb)
}

// analyzeScenario runs one end-to-end scenario: it updates the prior with the
// recent window, computes the predictive probability of the PR's observed
// failures, and prints both the posterior mean failure rate and that
// probability to stdout.
func analyzeScenario(alphaPrior, betaPrior, kRecent, nRecent, kPr, nPr float64) {
	// Fold the recent window into the prior.
	alphaPosterior, betaPosterior := posteriorParams(alphaPrior, betaPrior, kRecent, nRecent)

	// How likely are the PR's results under the updated model?
	predictive := posteriorPredictive(alphaPosterior, betaPosterior, kPr, nPr)

	// Report.
	fmt.Print("--- Results ---\n")
	fmt.Printf("Posterior Mean Failure Rate: %.4f\n", alphaPosterior/betaPosterior)
	fmt.Printf("Posterior Predictive Probability of %.0f failures in %.0f tests: %.6f\n\n", kPr, nPr, predictive)
}

// main walks a fixed table of scenarios, printing each label followed by the
// analysis for that scenario's historical / recent / PR counts.
func main() {
	scenarios := []struct {
		label                                             string
		alphaPrior, betaPrior, kRecent, nRecent, kPr, nPr float64
	}{
		{"Significant historical, limited mixed results in PR, possible ongoing issue", 10, 1000, 7, 27, 1, 2},
		{"Limited historical, limited mixed results in PR", 1, 30, 0, 5, 1, 2},
		{"Limited historical, unlikely regression in PR", 1, 30, 0, 20, 1, 10},
		{"Limited historical, obvious regression in PR", 1, 30, 0, 20, 10, 15},
		{"Strong high pass rate historical data, but this test is failing outside our PR in recent runs", 0, 1000, 20, 30, 1, 3},
		{"Slight Regression found from CR", 0, 418, 8, 106, 3, 10},
	}

	for _, s := range scenarios {
		fmt.Println(s.label)
		analyzeScenario(s.alphaPrior, s.betaPrior, s.kRecent, s.nRecent, s.kPr, s.nPr)
	}
}

135 changes: 135 additions & 0 deletions pkg/util/bayes/pr_bayes_statistics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
package bayes

import (
"testing"

log "github.com/sirupsen/logrus"
"gonum.org/v1/gonum/stat/distuv"
)

// BayesianSafetyCheck determines if a PR is safe to merge based on historical data,
// other PR results, and environment-specific issues.
//
// It builds a Beta posterior over the test's pass rate from three evidence
// sources — long-run historical counts, a recent window (weighted more
// heavily to catch ongoing incidents), and this PR's own results — then
// returns (probSafe, probRegression): the posterior probability that the pass
// rate has not / has dropped by more than thresholdDrop below the historical
// rate.
//
// Fixes over the original: with zero historical volume the threshold was
// NaN (0/0), and with zero total volume prWeight became +Inf; both are now
// guarded.
func BayesianSafetyCheck(
	historicalPasses, historicalFailures int, // simple historical data over the past x weeks
	recentPasses, recentFailures int, // recent data we'll weight more heavily to catch on-going incidents
	prPasses, prFailures int, // results from this PR
	thresholdDrop float64, // returns our confidence the test pass rate has dropped more than this amount
) (float64, float64) {

	// Laplace smoothing for historical data
	smoothing := 1.0
	alphaHistorical := float64(historicalPasses) + smoothing
	betaHistorical := float64(historicalFailures) + smoothing

	// Calculate recent-to-historical ratio
	recentVolume := float64(recentPasses + recentFailures)
	historicalVolume := float64(historicalPasses + historicalFailures)
	volumeScale := recentVolume / (historicalVolume + 1.0) // +1 avoids division by zero

	// Apply volume scaling to recent results, we want recent results to be considered far more significant
	// than historical.
	// Increase recent weight when failures are ongoing
	recentWeightBoost := 1.0 + (float64(recentFailures)/(float64(recentPasses)+1.0))*2.0
	dynamicWeight := (1.0 + volumeScale*2.0) * recentWeightBoost

	alphaOtherPr := float64(recentPasses)*dynamicWeight + smoothing
	betaOtherPr := float64(recentFailures)*dynamicWeight + smoothing

	// Combine priors
	alphaCombined := alphaHistorical + alphaOtherPr
	betaCombined := betaHistorical + betaOtherPr

	// New evidence weighting factor.
	// This is tricky, if we have say 1000 runs in the historical data, no PR can generate enough information
	// to possibly have the model think it could be a regression. We have to weight our PR samples to model
	// our intuition. Because it can depend on the amount of historical data, we do this dynamically based on
	// how much data we're up against.
	/*
		newEvidenceWeight := float64(historicalPasses+historicalFailures+recentPasses+recentFailures) / 20.0
		if newEvidenceWeight < 1.0 {
			newEvidenceWeight = 1.0 // Ensure a minimum weight
		}
	*/

	// Dynamically limit PR contribution based on historical and recent data volume.
	// Guard the zero-volume case, which previously produced an infinite weight.
	prWeight := 1.0 // Minimum weight for PR evidence
	if prWeightLimit := (recentVolume + historicalVolume) / 10.0; prWeightLimit > 0 {
		if w := float64(prPasses+prFailures) / prWeightLimit; w > prWeight {
			prWeight = w
		}
	}

	// Adjust combined prior with PR results
	alphaPosterior := alphaCombined + float64(prPasses)*prWeight
	betaPosterior := betaCombined + float64(prFailures)*prWeight

	log.Infof("alpha historical = %.1f, recent = %.1f, pr = %.1f",
		alphaHistorical,
		alphaOtherPr,
		alphaPosterior)
	log.Infof("beta historical = %.1f, recent = %.1f, pr = %.1f",
		betaHistorical,
		betaOtherPr,
		betaPosterior)

	// Define threshold for pass rate drop.
	// With no historical data at all the raw rate is 0/0 (NaN) and would
	// poison the CDF below, so fall back to the smoothed prior mean.
	historicalRate := alphaHistorical / (alphaHistorical + betaHistorical)
	if historicalPasses+historicalFailures > 0 {
		historicalRate = float64(historicalPasses) / float64(historicalPasses+historicalFailures)
	}
	threshold := historicalRate - thresholdDrop

	// Beta distribution for posterior
	betaDist := distuv.Beta{Alpha: alphaPosterior, Beta: betaPosterior}

	// Calculate probabilities: P(pass rate < threshold) is the regression mass.
	probRegression := betaDist.CDF(threshold)
	probSafe := 1.0 - probRegression

	log.Infof("Historical %d/%d, Recent Jobs %d/%d, This PR: %d/%d = Probability regression: %.3f, Probability safe: %.3f",
		historicalPasses, historicalFailures+historicalPasses,
		recentPasses, recentFailures+recentPasses,
		prPasses, prFailures+prPasses, probRegression, probSafe)

	return probSafe, probRegression
}

// Example usage: exercise BayesianSafetyCheck across a table of scenarios and
// log the resulting probabilities for manual inspection.
func Test_PRSafetyCheck(t *testing.T) {
	// passRateDrop is the drop in pass rate we're testing certainty for,
	// i.e. how certain are we the tests pass rate has dropped this percentage if we merge this PR
	const passRateDrop = 0.05

	scenarios := []struct {
		desc                   string
		histPasses, histFails  int
		recentPasses, recentFails int
		prPasses, prFails      int
	}{
		{"Significant historical, limited mixed results in PR, possible on-going issue",
			1000, 10, 20, 7, 1, 1},
		{"Limited historical, limited mixed results in PR",
			29, 1, 5, 0, 1, 1},
		{"Limited historical, unlikely regression in PR",
			29, 1, 20, 0, 9, 1},
		{"Limited historical, obvious regression in PR",
			29, 1, 20, 0, 5, 10},
		// Strong high pass rate historical data, but this test is failing
		// outside our PR in recent runs, be that other PRs or periodics:
		{"Stable test, on-going incident outside PR",
			1000, 0, 10, 20, 1, 2},
	}

	for _, s := range scenarios {
		log.Info(s.desc)
		BayesianSafetyCheck(
			s.histPasses, s.histFails,
			s.recentPasses, s.recentFails,
			s.prPasses, s.prFails,
			passRateDrop)
	}
}
Loading