diff --git a/cmd/gh-aw/main.go b/cmd/gh-aw/main.go index 89f2c40f592..9ab2fbbca9c 100644 --- a/cmd/gh-aw/main.go +++ b/cmd/gh-aw/main.go @@ -768,6 +768,7 @@ Use "` + string(constants.CLIExtensionPrefix) + ` help all" to show help for all lintCmd := cli.NewLintCommand() domainsCmd := cli.NewDomainsCommand() experimentsCmd := cli.NewExperimentsCommand() + forecastCmd := cli.NewForecastCommand() // Assign commands to groups // Setup Commands @@ -802,6 +803,7 @@ Use "` + string(constants.CLIExtensionPrefix) + ` help all" to show help for all healthCmd.GroupID = "analysis" checksCmd.GroupID = "analysis" experimentsCmd.GroupID = "analysis" + forecastCmd.GroupID = "analysis" // Utilities mcpServerCmd.GroupID = "utilities" @@ -844,6 +846,7 @@ Use "` + string(constants.CLIExtensionPrefix) + ` help all" to show help for all rootCmd.AddCommand(projectCmd) rootCmd.AddCommand(domainsCmd) rootCmd.AddCommand(experimentsCmd) + rootCmd.AddCommand(forecastCmd) // Fix help flag descriptions for all subcommands to be consistent with the // root command ("Show help for gh aw" vs the Cobra default "help for [cmd]"). diff --git a/docs/adr/31377-monte-carlo-projection-for-forecast-command.md b/docs/adr/31377-monte-carlo-projection-for-forecast-command.md new file mode 100644 index 00000000000..232137497c7 --- /dev/null +++ b/docs/adr/31377-monte-carlo-projection-for-forecast-command.md @@ -0,0 +1,97 @@ +# ADR-31377: Monte Carlo Projection for `gh aw forecast` Command + +**Date**: 2026-05-10 +**Status**: Draft +**Deciders**: Unknown (PR authored by `app/copilot-swe-agent`; human deciders TBD) + +--- + +## Part 1 — Narrative (Human-Friendly) + +### Context + +Users of `gh-aw` want to project the future cost and yield of their agentic workflows before scheduling them at higher cadence or rolling them out organization-wide. 
Historical run data is highly variable: per-run effective token usage can vary by an order of magnitude depending on agent decisions, runs per period follow a counting process, and not every run succeeds. A naive point estimate (e.g. `avg(tokens) × avg(runs/period)`) hides this uncertainty and tends to under-state tail risk. The command must also integrate with existing analysis infrastructure (episode classification, A/B experiment variant tracking, JSON output for agent consumers) and remain useful on small samples (≤30 days of history). + +### Decision + +We will introduce a new **experimental** `gh aw forecast` CLI command that projects per-workflow effective token usage using **Monte Carlo simulation** (10 000 trials) rather than a single point estimate. Each trial composes three independent sources of uncertainty — Poisson-distributed run counts, bootstrap-resampled per-run effective tokens, and Bernoulli-distributed success — and the aggregated trials yield P10/P50/P90 confidence intervals. The command lives in `pkg/cli/forecast*.go`, reuses the existing `buildEpisodeData` engine from `logs_episode.go` for episode analysis, supports remote repositories via `--repo`, and is gated as experimental (stderr warning + `(experimental)` short description) because the interface and statistical assumptions may change. + +### Alternatives Considered + +#### Alternative 1: Point estimates from historical averages + +Compute `mean(effective_tokens) × mean(runs_per_period) × success_rate` and report a single projected number per workflow. Simple, deterministic, and cheap. Rejected because it hides variance, gives users no way to reason about tail risk (which is the operationally interesting question for cost budgeting), and makes side-by-side comparisons across workflows misleading when their variance profiles differ. + +#### Alternative 2: Closed-form analytical distribution (e.g. 
compound Poisson) + +Model run count as Poisson(λ) and per-run tokens as a parametric distribution (lognormal, gamma) and derive percentiles analytically. More elegant and faster than simulation. Rejected because the historical token distribution is typically multi-modal (different agent paths produce qualitatively different cost profiles) and ill-suited to a single parametric family; bootstrap resampling preserves the empirical shape without forcing a fit. Closed form also makes per-variant A/B splits and success-rate composition awkward. + +#### Alternative 3: Reuse the existing `audit` command and add a `--forecast` flag + +Extend the audit command instead of creating a new top-level command. Rejected because forecasting has a different mental model from auditing (forward projection vs. retrospective analysis), a different input shape (workflow IDs vs. run IDs), and different output structure (per-period projections vs. per-run metrics). Bundling them would muddy both commands' interfaces. + +### Consequences + +#### Positive +- Users get P10/P50/P90 intervals, exposing tail risk that point estimates would hide. +- Bootstrap resampling preserves the empirical token distribution without imposing a parametric model. +- JSON output (`monte_carlo` field) gives downstream agents structured access to the full distribution summary. +- Reuse of `buildEpisodeData` avoids duplicating episode-classification logic and keeps semantics consistent with `logs`/`audit`. +- Experimental gating lets us iterate on the statistical model (e.g. switching distributions, adjusting trial count) without a stability commitment. + +#### Negative +- Monte Carlo introduces nondeterminism in output — two consecutive runs on the same data produce slightly different P50/P10/P90 values unless a seed is pinned. This complicates regression testing and snapshot comparisons. 
+- 10 000 trials × N workflows × bootstrap sampling adds CPU cost; the Poisson sampler has two regimes (Knuth exact for λ ≤ 15, Normal approximation otherwise) to stay within ~10 ms/workflow, but this adds complexity vs. a closed-form approach. +- Episode counts for orchestrator-style workflows are a lower-bound estimate because `AwContext` (dispatch/workflow_call) lineage is unavailable without artifact downloads, which the command intentionally skips for speed. +- Remote-repo mode (`--repo`) degrades frontmatter metadata to empty since Markdown source is local-only, creating a subtle behavior split between local and remote forecasts. +- Adds three new files in `pkg/cli/` (forecast_command.go, forecast.go, forecast_montecarlo.go) plus tests, increasing maintenance surface in an already large package. + +#### Neutral +- The `--days` flag is capped at 30, which is a deliberate sampling-window choice; longer windows would require pagination changes in `gh run list`. +- The W3C-style specification at `docs/src/content/docs/reference/forecast-specification.md` (sidebar order 1355) commits us to keeping spec and implementation in sync while the command is experimental. +- Trial count (10 000) is currently hardcoded; making it configurable is a future option but not part of this decision. + +--- + +## Part 2 — Normative Specification (RFC 2119) + +> The key words **MUST**, **MUST NOT**, **REQUIRED**, **SHALL**, **SHALL NOT**, **SHOULD**, **SHOULD NOT**, **RECOMMENDED**, **MAY**, and **OPTIONAL** in this section are to be interpreted as described in [RFC 2119](https://www.rfc-editor.org/rfc/rfc2119). + +### Projection Algorithm + +1. The `forecast` command **MUST** project per-workflow effective token usage using Monte Carlo simulation, not a single point estimate. +2. The simulation **MUST** run at least 10 000 independent trials per workflow per forecast invocation. +3. 
Each trial **MUST** compose three independent random variables: run count drawn from a Poisson process, per-run effective tokens drawn by bootstrap resampling of historical observations, and per-run success drawn as a Bernoulli with the historical success rate. +4. The Poisson sampler **MUST** use Knuth's exact algorithm when λ ≤ 15 and **MUST** use a Normal approximation when λ > 15. +5. The command **MUST** report P10, P50, and P90 effective-token percentiles in both the console table and JSON output. +6. The command **MUST NOT** emit only a point estimate without accompanying P10/P90 bounds. + +### Command Interface + +1. The command **MUST** be registered in the `analysis` command group as `gh aw forecast`. +2. The command **MUST** be marked experimental: its Cobra short description **MUST** include the literal substring `(experimental)`, and it **MUST** print an experimental warning to stderr at runtime. +3. The `--days` flag **MUST** accept only the values `7` and `30`; values outside this set **MUST** be rejected with a clear error. +4. The `--json` flag **MUST** emit the full `ForecastResult` struct including a `monte_carlo` object with `mean_projected_effective_tokens`, `std_dev_effective_tokens`, and P10/P50/P90 fields. +5. The command **MAY** accept multiple workflow IDs as positional arguments; when omitted, it **MUST** forecast all agentic workflows discoverable in the target repository. +6. When `--repo owner/repo` is supplied, workflow discovery **MUST** use the GitHub API (`fetchGitHubWorkflows`) and **MUST NOT** read local `.lock.yml` files for that invocation. +7. Workflow ID matching against remote repositories **MUST** be case-insensitive against both display names and file-path basenames. + +### Episode Analysis + +1. Episode grouping **MUST** reuse `buildEpisodeData` and `classifyEpisode` from `logs_episode.go`; it **MUST NOT** reimplement episode classification. +2. 
Because no artifacts are downloaded, episode linkage **MUST** rely only on GitHub Actions API fields (`event`, `headSha`, `headBranch`) and **MUST** gracefully degrade when `AwContext` is unavailable. +3. The console output **SHOULD** display an episode breakdown table only when `runs/episode > 1` (i.e. orchestrator-style workflows). + +### Frontmatter and Variants + +1. When forecasting local workflows, the command **MUST** surface active trigger types and concurrency configuration from each workflow's Markdown frontmatter. +2. When forecasting via `--repo`, frontmatter-derived fields **MAY** be empty without causing the forecast to fail. +3. When a workflow defines A/B experiment variants, run counts and fractions **MUST** be reported per variant in both console and JSON output. + +### Conformance + +An implementation is considered conformant with this ADR if it satisfies all **MUST** and **MUST NOT** requirements above. Failure to meet any **MUST** or **MUST NOT** requirement constitutes non-conformance. + +--- + +*This is a DRAFT ADR generated by the [Design Decision Gate](https://github.com/github/gh-aw/actions/runs/25642964043) workflow. 
The PR author must review, complete, and finalize this document before the PR can merge.* diff --git a/docs/src/content/docs/reference/forecast-specification.md b/docs/src/content/docs/reference/forecast-specification.md new file mode 100644 index 00000000000..c40a53c2b37 --- /dev/null +++ b/docs/src/content/docs/reference/forecast-specification.md @@ -0,0 +1,973 @@ +--- +title: Forecast Command Specification +description: Formal W3C-style specification for the gh aw forecast command — Monte Carlo token-usage projection, episode analysis, workflow discovery, and output formats for GitHub Agentic Workflows +sidebar: + order: 1355 +--- + +# Forecast Command Specification + +**Version**: 0.1.0 +**Status**: Experimental Draft +**Latest Version**: [forecast-specification](/gh-aw/reference/forecast-specification/) +**Editor**: GitHub Agentic Workflows Team + +> ⚠️ **Experimental**: This specification describes a feature that is under active development. The command interface, output schema, and algorithmic parameters are subject to change without notice. Do not depend on this interface in production workflows. + +--- + +## Abstract + +This specification defines the `gh aw forecast` command for the GitHub Agentic Workflows (gh-aw) project. The command performs historical sampling of completed agentic workflow runs and applies a Monte Carlo simulation engine to project future Effective Token (ET) consumption over a configurable time horizon. The specification covers workflow discovery (local and remote modes), data sampling via the GitHub Actions API, the Poisson–bootstrap Monte Carlo projection algorithm, episode-level analysis, and both console-table and machine-readable JSON output formats. Implementations conforming to this specification provide operators with probabilistic token-consumption forecasts suitable for capacity planning, cost estimation, and budget governance. 
+ +--- + +## Status of This Document + +This section describes the status of this document at the time of publication. This is an **Experimental Draft** specification and may be updated, replaced, or made obsolete by other documents at any time. The feature it describes is experimental and not yet subject to the stability guarantees that apply to other gh-aw commands. + +This document is governed by the GitHub Agentic Workflows project specifications process. + +Feedback should be filed as GitHub issues against the `github/gh-aw` repository. + +--- + +## Table of Contents + +1. [Introduction](#1-introduction) +2. [Conformance](#2-conformance) +3. [Terminology](#3-terminology) +4. [Command Interface](#4-command-interface) +5. [Workflow Discovery](#5-workflow-discovery) +6. [Data Sampling](#6-data-sampling) +7. [Monte Carlo Projection Engine](#7-monte-carlo-projection-engine) +8. [Episode Analysis](#8-episode-analysis) +9. [Output Formats](#9-output-formats) +10. [Error Handling](#10-error-handling) +11. [Implementation Requirements](#11-implementation-requirements) +12. [Compliance Testing](#12-compliance-testing) +13. [Appendices](#appendices) +14. [References](#references) +15. [Change Log](#change-log) + +--- + +## 1. Introduction + +### 1.1 Purpose + +The `gh aw forecast` command addresses the operational need to predict future Large Language Model (LLM) token expenditure for agentic workflows managed by gh-aw. Token consumption is a primary cost driver for agentic systems; the ability to project future usage from historical observations enables: + +- **Capacity Planning**: Anticipating token demand before budget thresholds are reached. +- **Cost Governance**: Providing P10/P50/P90 confidence intervals for financial planning. +- **Workflow Comparison**: Ranking workflows by projected token cost across a shared time period. +- **Experiment Evaluation**: Measuring the token impact of A/B experiment variants. 
+ +The command combines empirical bootstrapping of historical token observations with a Poisson-distributed run-count model to produce statistically sound projections without requiring parametric distribution assumptions on token usage. + +### 1.2 Scope + +This specification covers: + +- Command-line interface: flags, positional arguments, and invocation modes +- Workflow discovery in local (`.github/workflows/`) and remote (`--repo`) modes +- Historical run sampling and per-run metric derivation +- The Monte Carlo simulation algorithm producing P10, P50, P90 percentile estimates +- Episode grouping and episode-level metric computation +- Console table output format +- Machine-readable JSON output schema (`--json`) +- Error conditions and graceful-degradation behavior + +This specification does NOT cover: + +- The Effective Tokens (ET) computation algorithm (defined in the [Effective Tokens Specification](/gh-aw/reference/effective-tokens-specification/)) +- The `aw_info.json` artifact schema +- A/B experiment frontmatter schema (defined in the [A/B Experiments Specification](/gh-aw/reference/experiments-specification/)) +- Billing, pricing, or financial modeling beyond token projections +- Streaming or real-time token consumption reporting + +### 1.3 Design Goals + +A conforming `gh aw forecast` implementation MUST be designed for: + +- **Empirical Accuracy**: Projections derived from observed historical data rather than assumed distributions. +- **Probabilistic Reporting**: P10/P50/P90 uncertainty bounds communicated to callers. +- **Graceful Degradation**: Missing data (no runs, no artifacts, no frontmatter) MUST produce partial results rather than failures. +- **Dual Modes**: Both local-repository and remote-repository operation without requiring a checkout. +- **Interoperability**: JSON output schema stable enough for machine consumption by downstream tooling. + +--- + +## 2. 
Conformance + +### 2.1 Conformance Classes + +A **conforming forecast implementation** is one that satisfies all MUST, REQUIRED, and SHALL requirements in this specification. + +A **partially conforming forecast implementation** is one that satisfies all MUST requirements in Sections 4, 5, 6, and 7 but MAY lack support for optional features such as episode analysis (Section 8), experiment variant reporting, or verbose diagnostics. + +### 2.2 Requirements Notation + +The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [RFC 2119](https://www.ietf.org/rfc/rfc2119.txt). + +### 2.3 Compliance Levels + +Implementations are classified by the highest of the following cumulative levels they support; every implementation MUST support at least Level 1: + +- **Level 1 (Required)**: Command invocation, workflow discovery, historical data sampling, and Monte Carlo projection with console output. +- **Level 2 (Standard)**: JSON output (`--json`), episode analysis, remote-repository mode (`--repo`), and experiment variant reporting. +- **Level 3 (Complete)**: All optional features including `--verbose` diagnostics, concurrency limit reporting, and frontmatter metadata enrichment. + +--- + +## 3. Terminology + +### 3.1 Effective Tokens (ET) + +A normalized unit of LLM token consumption defined in the [Effective Tokens Specification](/gh-aw/reference/effective-tokens-specification/). ET accounts for token class weights and model multipliers to produce a single comparable scalar across heterogeneous LLM invocations. + +### 3.2 Workflow Run + +A single execution of a GitHub Actions workflow. A run has a unique numeric run ID, an event type, a status (`completed`, `in_progress`, `queued`), a conclusion (`success`, `failure`, `cancelled`, etc.), and a head commit SHA. + +### 3.3 Historical Window + +The time interval `[now − days, now]` used to bound the set of completed runs eligible for sampling. Controlled by the `--days` flag. 
+ +### 3.4 Sample + +The subset of completed workflow runs within the historical window selected for metric derivation. The maximum sample size per workflow is controlled by the `--sample` flag. + +### 3.5 Monte Carlo Trial + +A single independent simulation that draws stochastic values for run count, per-run token usage, and per-run success, combining them to produce one projected Effective Token total for the projection period. + +### 3.6 Projection Period + +The future time interval for which token consumption is projected. Controlled by the `--period` flag; either a fixed 7-day week (`week`) or a fixed 30-day month (`month`), not a calendar week or calendar month. + +### 3.7 Observed Runs Per Period + +The rate of workflow runs observed in the historical window, extrapolated to the projection period length: + +``` +observed_runs_per_period = (sampled_run_count / history_days) × period_days +``` + +Where `period_days` is 7 for `week` and 30 for `month`. + +### 3.8 Episode + +A logical grouping of one or more workflow runs that collectively represent a single task attempt. Episodes are identified by grouping runs sharing the same `headSha` and `headBranch`, or by `workflow_dispatch`/`workflow_call` linkage where available. + +### 3.9 Yield + +The effective throughput rate: the expected number of successful runs per projection period, computed as the product of the observed run frequency and the historical success rate: + +``` +yield = observed_runs_per_period × success_rate +``` + +Where `success_rate = successful_run_count / total_sampled_run_count`. + +### 3.10 Bootstrap Resampling + +An empirical resampling technique where individual observations are drawn with replacement from the observed sample. Used in Section 7 to model per-run token usage without parametric distribution assumptions. + +### 3.11 Lock File + +A `.lock.yml` file located in `.github/workflows/` that declares a compiled agentic workflow and its associated metadata. 
Lock files are the authoritative source of workflow identifiers in local mode. + +--- + +## 4. Command Interface + +### 4.1 Synopsis + +``` +gh aw forecast [workflow_id...] [flags] +``` + +### 4.2 Positional Arguments + +| Argument | Type | Required | Description | +|---|---|---|---| +| `workflow_id` | string (repeatable) | No | Zero or more workflow identifiers to forecast. If omitted, all discovered agentic workflows are forecasted. | + +Workflow identifiers MUST be matched case-insensitively against: +1. The workflow display name +2. The workflow file-path basename (without extension) + +If a provided `workflow_id` does not match any discovered workflow, the implementation MUST emit an error message identifying the unmatched identifier and MUST exit with a non-zero status code. + +### 4.3 Flags + +| Flag | Type | Default | Description | +|---|---|---|---| +| `--days` | int | `30` | Length of the historical sampling window in days. Permitted values: `7`, `30`. | +| `--period` | string | `"month"` | Projection period length. Permitted values: `"week"`, `"month"`. | +| `--sample` | int | `100` | Maximum number of completed runs to sample per workflow. MUST be ≥ 1. | +| `--repo` | string | (none) | Target a repository other than the current working directory, in `owner/repo` format. Enables remote mode. | +| `--json` | bool | `false` | Emit machine-readable JSON output instead of console tables. | +| `--verbose` | bool | `false` | Emit verbose diagnostic output to stderr during processing. | + +### 4.4 Flag Validation + +Implementations MUST validate all flag values before beginning any API calls or file system operations: + +- **R-CLI-001**: If `--days` is not one of `{7, 30}`, the implementation MUST exit with a non-zero status and an error message specifying the permitted values. +- **R-CLI-002**: If `--period` is not one of `{"week", "month"}`, the implementation MUST exit with a non-zero status and an error message specifying the permitted values. 
+- **R-CLI-003**: If `--sample` is less than 1, the implementation MUST exit with a non-zero status. +- **R-CLI-004**: If `--repo` is provided, it MUST match the pattern `owner/repo` (two non-empty components separated by `/`). An invalid format MUST produce a non-zero exit with a descriptive error. + +### 4.5 Exit Codes + +| Code | Meaning | +|---|---| +| `0` | Forecast completed successfully. | +| `1` | Usage error (invalid flags, unmatched workflow IDs). | +| `2` | GitHub API authentication failure. | +| `3` | No workflows discovered. | + +### 4.6 Example Invocations + +```sh +# Forecast all agentic workflows in the current repository for the next month +gh aw forecast + +# Forecast two specific workflows and compare +gh aw forecast ci-doctor daily-planner + +# Use a 7-day window and project over the next week +gh aw forecast --period week --days 7 + +# Emit machine-readable JSON +gh aw forecast --json + +# Forecast workflows in a remote repository +gh aw forecast --repo owner/repo + +# Forecast a specific workflow in a remote repository +gh aw forecast --repo owner/repo ci-doctor +``` + +--- + +## 5. Workflow Discovery + +### 5.1 Modes + +The forecast command operates in one of two discovery modes, determined by the presence of the `--repo` flag: + +- **Local Mode**: `--repo` is absent; workflows are discovered from the current repository's `.github/workflows/` directory. +- **Remote Mode**: `--repo` is present; workflows are discovered via the GitHub Actions API. + +### 5.2 Local Mode Discovery + +In local mode, the implementation MUST: + +1. **R-DISC-001**: Enumerate all files matching `*.lock.yml` within `.github/workflows/` of the current working repository. +2. **R-DISC-002**: Parse each lock file to extract the workflow identifier and display name. +3. **R-DISC-003**: If the `.github/workflows/` directory does not exist or contains no lock files, the implementation MUST emit an informational message and exit with code `3`. 
+ +The implementation MAY additionally read frontmatter metadata from corresponding workflow source files to enrich per-workflow records with: + +- Active trigger types (`active_triggers`) +- Concurrency configuration (`concurrency_limit`) +- A/B experiment variant declarations (`experiment_variants`) + +Frontmatter enrichment is OPTIONAL; absence of a corresponding source file MUST NOT prevent discovery or projection of the workflow. + +### 5.3 Remote Mode Discovery + +In remote mode (when `--repo owner/repo` is specified), the implementation MUST: + +1. **R-DISC-010**: Call the GitHub Actions API (`GET /repos/{owner}/{repo}/actions/workflows`) to enumerate workflows in the target repository. +2. **R-DISC-011**: Filter the returned workflows to those identified as agentic (e.g., by inspecting file-path conventions, labels, or other implementation-defined heuristics). +3. **R-DISC-012**: Match any caller-supplied `workflow_id` positional arguments against workflow display names and file-path basenames using case-insensitive string comparison. + +In remote mode, frontmatter metadata (triggers, concurrency, experiment variants) is UNAVAILABLE because the workflow source files are not accessible. The implementation MUST degrade gracefully: fields that depend on frontmatter MUST be omitted from output or reported as their zero/empty values rather than causing an error. + +### 5.4 Workflow ID Matching + +Workflow ID matching MUST be case-insensitive. A caller-supplied identifier matches a discovered workflow if and only if it equals (ignoring case) either: + +- The workflow's display name, OR +- The basename of the workflow's file path (without file extension) + +Matching MUST be performed after discovery is complete; partial prefix matches are NOT sufficient for conformance. + +--- + +## 6. 
Data Sampling + +### 6.1 Sampling Procedure + +For each discovered workflow (or each workflow in the filtered set), the implementation MUST perform the following sampling procedure: + +1. **R-SAMP-001**: Query completed workflow runs within the historical window using the equivalent of `gh run list --workflow <workflow> --status completed --limit <sample> --created >=<window-start-date>`. +2. **R-SAMP-002**: Limit the returned run set to at most `--sample` runs. +3. **R-SAMP-003**: For each run in the sample, derive the per-run metrics defined in Section 6.2. +4. **R-SAMP-004**: Record the count of runs with a successful conclusion separately from the total sampled count. + +If the historical window yields zero completed runs for a workflow, the implementation MUST: + +- **R-SAMP-005**: Return `nil` (or a sentinel empty result) for that workflow's Monte Carlo projection. +- **R-SAMP-006**: Include the workflow in output with `sampled_runs: 0` and all projection fields set to zero. +- **R-SAMP-007**: SHOULD emit a warning indicating that no historical data is available for the workflow. + +### 6.2 Per-Run Metric Derivation + +For each sampled run, the implementation MUST derive: + +| Metric | Source | Description | +|---|---|---| +| `effective_tokens` | Cached `run_summary.json` (see Section 6.2.1) | Total ET for this run as defined in the Effective Tokens Specification. | +| `duration_seconds` | Run start/end timestamps | Wall-clock duration of the run in seconds. | +| `success` | Run conclusion field | `true` if conclusion is `"success"`, `false` otherwise. | + +#### 6.2.1 Effective Token Retrieval + +Effective token counts are obtained from locally-cached run summaries when available. The `gh aw logs` command stores a `run_summary.json` file for each processed run under `{output_dir}/run-{run_id}/`. During forecasting the implementation: + +- **R-SAMP-010**: MUST attempt to load the cached `run_summary.json` for each sampled run using the default logs output directory (`.github/aw/logs`). 
+- **R-SAMP-011**: MUST extract the `TotalEffectiveTokens` field from the cached `TokenUsage` summary when present. +- **R-SAMP-012**: If no cached summary exists or the ET field is zero, the run's ET contribution MUST be treated as zero and the run MUST still be counted in `sampled_runs`. The implementation SHOULD log a debug-level warning. + +This lightweight approach avoids re-downloading artifacts while still providing accurate ET observations for runs that have already been processed locally by `gh aw logs`. + +#### 6.2.2 Duration Derivation + +Duration MUST be computed as: + +``` +duration_seconds = run.updated_at − run.started_at +``` + +Both timestamps MUST be sourced from the GitHub Actions API run object. If either timestamp is zero or unavailable, the run's duration contribution SHOULD be treated as zero. + +### 6.3 Observed Rate Computation + +After sampling, the implementation MUST compute: + +``` +observed_runs_per_period = (sampled_run_count / history_days) × period_days +``` + +Where: +- `history_days` is the value of `--days` +- `period_days` is `7` for `"week"` and `30` for `"month"` + +--- + +## 7. Monte Carlo Projection Engine + +### 7.1 Overview + +The Monte Carlo engine runs **10,000 independent simulation trials** per workflow to produce a probability distribution over projected Effective Token consumption in the next projection period. The engine models three independent sources of uncertainty per trial. + +Implementations MUST use exactly 10,000 trials. The trial count is a normative requirement to ensure consistency of P10/P50/P90 estimates across implementations. 
+ +### 7.2 Uncertainty Sources + +Each trial draws independently from three stochastic components: + +#### 7.2.1 Run Count (Poisson Model) + +The number of runs in the projection period is modeled as a Poisson random variable with rate parameter: + +``` +λ = observed_runs_per_period +``` + +The implementation MUST use: + +- **Knuth's exact algorithm** when `λ ≤ 15`: + + ``` + L ← e^(−λ) + k ← 0; p ← 1 + repeat: + k ← k + 1 + p ← p × Uniform(0, 1) + until p ≤ L + return k − 1 + ``` + +- **Normal approximation** when `λ > 15`: + + ``` + k ← round(Normal(μ=λ, σ=sqrt(λ))) + k ← max(0, k) + ``` + +- **R-MC-001**: For `λ = 0`, the implementation MUST return a projected token total of 0 for that trial without invoking either algorithm. + +#### 7.2.2 Per-Run Token Usage (Bootstrap Resampling) + +Token usage per run is modeled empirically using bootstrap resampling: + +- **R-MC-010**: For each run in a trial, the implementation MUST draw one observation uniformly at random **with replacement** from the set of historical ET observations in the sample. +- **R-MC-011**: If the sample contains zero ET observations (all runs had missing artifacts), the per-run token draw MUST return 0. + +This non-parametric approach preserves the empirical distribution of token usage, including multi-modal distributions and heavy tails, without imposing a parametric form. + +#### 7.2.3 Per-Run Success (Bernoulli Model) + +Whether a given run in the trial succeeds is modeled as a Bernoulli draw: + +``` +P(success) = success_rate = successful_run_count / total_sampled_run_count +``` + +- **R-MC-020**: Each run in a trial MUST independently draw from `Bernoulli(success_rate)`. +- **R-MC-021**: Only successful runs contribute their token draw to the trial's projected total. Failed runs contribute zero tokens to the projection. +- **R-MC-022**: If `total_sampled_run_count = 0`, `success_rate` MUST be treated as 0. The implementation MUST return a zero projection for all trials. 
+ +### 7.3 Trial Aggregation + +For a given trial with `k` drawn runs: + +``` +trial_tokens = Σ_{i=1}^{k} (success_i × token_draw_i) +``` + +Where: +- `success_i` is `1` if the Bernoulli draw for run `i` succeeds, `0` otherwise +- `token_draw_i` is the bootstrapped ET observation for run `i` + +### 7.4 Output Statistics + +After completing all 10,000 trials, the implementation MUST compute and report: + +| Statistic | Definition | +|---|---| +| `mean_projected_effective_tokens` | Arithmetic mean of all trial totals | +| `std_dev_effective_tokens` | Population or sample standard deviation of all trial totals | +| `p10_projected_effective_tokens` | 10th percentile of trial totals (lower bound of 80% CI) | +| `p50_projected_effective_tokens` | 50th percentile of trial totals (median projection) | +| `p90_projected_effective_tokens` | 90th percentile of trial totals (upper bound of 80% CI) | + +Percentile computation MUST use the nearest-rank method or an equivalent method that produces results consistent with a 10,000-element sorted array. + +The `projected_effective_tokens` top-level field MUST equal `p50_projected_effective_tokens`. + +### 7.5 Nil Projection Condition + +If no historical runs are available for a workflow, the implementation MUST return a nil (empty/zero) projection for that workflow. Nil projections MUST be represented in JSON output as zero values for all numeric Monte Carlo fields. The implementation MUST NOT run trials when the sample is empty. + +--- + +## 8. Episode Analysis + +### 8.1 Purpose + +An **episode** is a logical grouping of one or more workflow runs that collectively represent a single task attempt. Episode analysis computes per-episode metrics to reveal how many runs, on average, are required to complete a task successfully. 
+ +### 8.2 Episode Construction + +The implementation MUST group sampled runs into episodes using the `buildEpisodeData` and `classifyEpisode` engine: + +- **R-EP-001**: Runs sharing the same `headSha` and `headBranch` MUST be grouped into the same episode. +- **R-EP-002**: Runs linked by `workflow_dispatch` or `workflow_call` relationships (reconstructed from cached run summaries) SHOULD be merged into the triggering run's episode. + +#### 8.2.1 Limitations in Forecast Context + +During forecasting, full artifact data may not be available for all sampled runs. When cached summary data is unavailable: + +- **R-EP-010**: `workflow_dispatch`/`workflow_call` linkage MUST be omitted from episode construction. +- **R-EP-011**: The resulting `sampled_episodes` count MUST be treated as a **lower-bound estimate**. Implementations MUST communicate this limitation in output (e.g., via a note in console output or a boolean `episode_count_is_lower_bound` field in JSON). + +For orchestrator workflows that primarily receive `workflow_call` triggers, the episode count underestimate may be significant. Implementations SHOULD emit a warning when the dominant trigger type is `workflow_call` or `workflow_dispatch`. + +### 8.3 Episode Metrics + +For each workflow, the implementation MUST compute: + +| Metric | Definition | +|---|---| +| `sampled_episodes` | Count of distinct episodes identified in the sample | +| `runs_per_episode` | `sampled_run_count / sampled_episodes` | +| `avg_effective_tokens_per_episode` | Mean ET summed across all runs within each episode | +| `observed_episodes_per_period` | `(sampled_episodes / history_days) × period_days` | + +### 8.4 Episode Table Display + +The implementation MUST display the episode analysis table in console output when any workflow in the result set has `runs_per_episode > 1.0`. The table SHOULD be omitted when all workflows have `runs_per_episode = 1.0` (one run per episode is the baseline and adds no additional information). 
+ +--- + +## 9. Output Formats + +### 9.1 Console Table Output + +When `--json` is not specified, the implementation MUST render a formatted console table to stderr with the following columns: + +| Column | Description | +|---|---| +| `Workflow` | Workflow display name or identifier | +| `Sampled Runs` | Count of completed runs included in the sample | +| `Success Rate` | Fraction of sampled runs concluding with `success`, formatted as a percentage; `N/A` when no runs were sampled | +| `Yield/Period` | Effective throughput rate (`success_rate × observed_runs_per_period`) formatted to one decimal place | +| `Avg ET` | `avg_effective_tokens` formatted as K/M abbreviations (e.g. `12.5K`, `1.20M`); `-` when zero | +| `Proj. ET (P50)` | Median projected effective tokens from Monte Carlo (P50), formatted as K/M abbreviations | +| `80% CI (P10–P90)` | Confidence interval range `p10–p90`, both formatted as K/M abbreviations | +| `Triggers` | Comma-separated list of active trigger event names from frontmatter (up to 3, remainder shown as `+N`) | + +#### 9.1.1 Table Formatting Requirements + +- **R-OUT-001**: Column widths MUST be auto-fitted to the widest value in each column. +- **R-OUT-002**: ET values MUST be formatted as K/M abbreviations (e.g. `12.5K`, `1.20M`); raw integer values of zero MUST be rendered as `-`. +- **R-OUT-003**: Rows MUST be sorted by Monte Carlo P50 projected effective tokens in descending order; when Monte Carlo data is unavailable, sort by `projected_effective_tokens`. +- **R-OUT-004**: A workflow with zero sampled runs MUST appear in the table with `-` in projection columns and `N/A` in rate columns. +- **R-OUT-005**: When episode analysis is applicable (Section 8.4), a second table with episode metrics MUST be printed below the main table, separated by a blank line. + +#### 9.1.2 Example Console Output + +``` +Workflow Sampled Runs Success Rate Yield/Period Avg ET Proj. 
ET (P50) 80% CI (P10–P90) Triggers +ci-doctor 42 92% 35.4 12.5K 480.0K 430.0K–535.0K pull_request, workflow_dispatch +daily-planner 18 89% 14.4 8.2K 131.0K 105.0K–158.0K schedule +``` + +### 9.2 JSON Output Schema + +When `--json` is specified, the implementation MUST emit a single JSON object to stdout conforming to the following schema. Additional content (banners, progress indicators, or table output) MUST NOT be emitted to stdout. Diagnostic messages MAY be emitted to stderr. + +#### 9.2.1 Root Object + +```json +{ + "period": "", + "as_of": "", + "workflows": [ , ... ] +} +``` + +| Field | Type | Required | Description | +|---|---|---|---| +| `period` | string | MUST | Projection period: `"week"` or `"month"`. | +| `as_of` | string | MUST | ISO 8601 / RFC 3339 UTC timestamp at which the forecast was computed. | +| `workflows` | array | MUST | Ordered array of per-workflow forecast objects. MUST be sorted by `projected_effective_tokens` (P50) descending. | + +#### 9.2.2 WorkflowForecast Object + +```json +{ + "workflow_id": "", + "period": "", + "sampled_runs": , + "history_days": , + "observed_runs_per_period": , + "success_rate": , + "yield": , + "avg_effective_tokens": , + "avg_duration_seconds": , + "projected_effective_tokens": , + "active_triggers": [ "", ... ], + "concurrency_limit": , + "monte_carlo": { }, + "episode_analysis": { }, + "experiment_variants": [ , ... ] +} +``` + +| Field | Type | Required | Description | +|---|---|---|---| +| `workflow_id` | string | MUST | Workflow identifier as used in discovery. | +| `period` | string | MUST | Mirrors the root `period` field. | +| `sampled_runs` | integer | MUST | Number of runs included in the sample. | +| `history_days` | integer | MUST | Value of `--days` used for this forecast. | +| `observed_runs_per_period` | number | MUST | Extrapolated run rate for the projection period. | +| `success_rate` | number | MUST | Fraction of sampled runs that concluded successfully, in `[0.0, 1.0]`. 
| +| `yield` | number | MUST | Effective throughput rate: `success_rate × observed_runs_per_period`. | +| `avg_effective_tokens` | number | MUST | Mean ET per sampled run. `0` when no ET data is available. | +| `avg_duration_seconds` | number | MUST | Mean wall-clock duration per sampled run in seconds. | +| `projected_effective_tokens` | number | MUST | P50 Monte Carlo projection. Equals `monte_carlo.p50_projected_effective_tokens`. | +| `active_triggers` | array of strings | SHOULD | Trigger event types from workflow frontmatter. Empty array when frontmatter is unavailable. | +| `concurrency_limit` | integer | SHOULD | Concurrency group limit from frontmatter. `0` indicates unlimited or unavailable. | +| `monte_carlo` | object | MUST | Monte Carlo simulation results. See Section 9.2.3. | +| `episode_analysis` | object | SHOULD | Episode analysis results. See Section 9.2.4. | +| `experiment_variants` | array | MAY | A/B experiment variant breakdown. See Section 9.2.5. Empty array when frontmatter is unavailable or no experiments are configured. | + +#### 9.2.3 MonteCarlo Object + +```json +{ + "iterations": 10000, + "mean_projected_effective_tokens": , + "std_dev_effective_tokens": , + "p10_projected_effective_tokens": , + "p50_projected_effective_tokens": , + "p90_projected_effective_tokens": +} +``` + +| Field | Type | Required | Description | +|---|---|---|---| +| `iterations` | integer | MUST | Number of trials executed: `10000` whenever the simulation ran; `0` when `sampled_runs = 0`. | +| `mean_projected_effective_tokens` | number | MUST | Arithmetic mean of trial totals. | +| `std_dev_effective_tokens` | number | MUST | Standard deviation of trial totals. | +| `p10_projected_effective_tokens` | number | MUST | 10th percentile of trial totals. | +| `p50_projected_effective_tokens` | number | MUST | 50th percentile (median) of trial totals. | +| `p90_projected_effective_tokens` | number | MUST | 90th percentile of trial totals. | + +When `sampled_runs = 0`, all numeric fields in this object MUST be `0` and `iterations` MUST be `0`. 
+ +#### 9.2.4 EpisodeAnalysis Object + +```json +{ + "sampled_episodes": , + "runs_per_episode": , + "avg_effective_tokens_per_episode": , + "observed_episodes_per_period": +} +``` + +| Field | Type | Required | Description | +|---|---|---|---| +| `sampled_episodes` | integer | MUST | Distinct episode count. Lower-bound estimate when artifact linkage is unavailable. | +| `runs_per_episode` | number | MUST | Mean runs per episode. | +| `avg_effective_tokens_per_episode` | number | MUST | Mean ET per episode. | +| `observed_episodes_per_period` | number | MUST | Extrapolated episode rate for the projection period. | + +#### 9.2.5 ExperimentVariant Object + +```json +{ + "experiment_name": "", + "variant": "", + "run_count": , + "fraction": +} +``` + +| Field | Type | Required | Description | +|---|---|---|---| +| `experiment_name` | string | MUST | Name of the A/B experiment from frontmatter. | +| `variant` | string | MUST | Variant identifier (e.g., `"control"`, `"treatment"`). | +| `run_count` | integer | MUST | Number of sampled runs assigned to this variant. | +| `fraction` | number | MUST | `run_count / sampled_runs` for this workflow; fraction in `[0.0, 1.0]`. 
| + +#### 9.2.6 Complete JSON Example + +```json +{ + "period": "month", + "as_of": "2026-05-10T22:00:00Z", + "workflows": [ + { + "workflow_id": "ci-doctor", + "period": "month", + "sampled_runs": 42, + "history_days": 30, + "observed_runs_per_period": 38.5, + "success_rate": 0.92, + "yield": 35.42, + "avg_effective_tokens": 12500, + "avg_duration_seconds": 145.3, + "projected_effective_tokens": 480000, + "active_triggers": ["pull_request", "workflow_dispatch"], + "concurrency_limit": 0, + "monte_carlo": { + "iterations": 10000, + "mean_projected_effective_tokens": 481250, + "std_dev_effective_tokens": 32000.5, + "p10_projected_effective_tokens": 430000, + "p50_projected_effective_tokens": 480000, + "p90_projected_effective_tokens": 535000 + }, + "episode_analysis": { + "sampled_episodes": 40, + "runs_per_episode": 1.05, + "avg_effective_tokens_per_episode": 13100, + "observed_episodes_per_period": 36.7 + }, + "experiment_variants": [ + { + "experiment_name": "model-selection", + "variant": "control", + "run_count": 21, + "fraction": 0.5 + }, + { + "experiment_name": "model-selection", + "variant": "treatment", + "run_count": 21, + "fraction": 0.5 + } + ] + } + ] +} +``` + +### 9.3 Output Ordering + +- **R-OUT-010**: In both console and JSON output, workflows MUST be ordered by `projected_effective_tokens` (P50 value) in descending order. +- **R-OUT-011**: Workflows with zero projected tokens MUST appear after all workflows with non-zero projections. +- **R-OUT-012**: Among workflows with equal projected tokens, the ordering SHOULD be deterministic (e.g., alphabetical by workflow ID). + +--- + +## 10. Error Handling + +### 10.1 Authentication Errors + +If the GitHub API returns an authentication error (HTTP 401 or 403): + +- **R-ERR-001**: The implementation MUST emit a descriptive error message to stderr indicating the authentication failure and guidance on re-authenticating with `gh auth login`. +- **R-ERR-002**: The implementation MUST exit with code `2`. 
+ +### 10.2 API Rate Limiting + +If the GitHub API returns a rate-limit response (HTTP 429 or a `X-RateLimit-Remaining: 0` header): + +- **R-ERR-010**: The implementation SHOULD retry the request after the period indicated by the `X-RateLimit-Reset` header. +- **R-ERR-011**: The implementation MUST emit a warning to stderr when entering a rate-limit wait state. +- **R-ERR-012**: If retry is not feasible, the implementation MUST exit with a non-zero status and a message indicating the rate limit condition. + +### 10.3 Partial Failures + +When one or more workflows in the discovery set encounter individual errors (e.g., artifact download failure, API timeout for a specific workflow): + +- **R-ERR-020**: The implementation MUST continue processing the remaining workflows rather than aborting the entire forecast. +- **R-ERR-021**: Workflows that encountered individual errors MUST appear in output with `sampled_runs: 0` and all projection fields zeroed. +- **R-ERR-022**: The implementation MUST emit a warning to stderr for each workflow that encountered an individual error. + +### 10.4 No Workflows Discovered + +If workflow discovery yields zero workflows: + +- **R-ERR-030**: The implementation MUST emit a message to stderr indicating that no agentic workflows were found and describing the discovery mode used. +- **R-ERR-031**: The implementation MUST exit with code `3`. + +### 10.5 Verbose Diagnostics + +When `--verbose` is specified, the implementation SHOULD emit the following additional diagnostic information to stderr: + +- The list of discovered workflows and their identifiers +- The number of runs fetched per workflow +- The number of runs with valid ET data versus missing artifacts +- The computed `λ` (Poisson rate) for each workflow +- Timing information for API calls and simulation execution + +--- + +## 11. 
Implementation Requirements + +### 11.1 Randomness + +- **R-IMPL-001**: The Monte Carlo engine MUST use a cryptographically seeded pseudorandom number generator (PRNG). Implementations MUST NOT use a fixed seed unless in test mode. +- **R-IMPL-002**: The PRNG MUST be seeded independently per forecast invocation to ensure different results on repeated calls. + +### 11.2 Performance + +- **R-IMPL-010**: The 10,000-trial simulation for a single workflow MUST complete within 500 milliseconds on a single CPU core with a sample size of 100 runs. +- **R-IMPL-011**: Multiple workflows SHOULD be forecasted concurrently where the runtime environment supports parallelism. +- **R-IMPL-012**: API calls for data sampling SHOULD be made concurrently across workflows, subject to GitHub API rate limit constraints. + +### 11.3 Deterministic Output + +- **R-IMPL-020**: Given a fixed sample and fixed PRNG seed (in test mode), the Monte Carlo output MUST be reproducible. This requirement applies to test and validation scenarios only; production invocations MUST use random seeds (R-IMPL-001). + +### 11.4 Numeric Precision + +- **R-IMPL-030**: All intermediate ET computations MUST use 64-bit floating-point arithmetic (IEEE 754 double precision). +- **R-IMPL-031**: JSON serialization of numeric fields MUST NOT produce non-finite values (`NaN`, `+Inf`, `-Inf`). If a computation produces a non-finite value, it MUST be replaced with `0` and a warning MUST be emitted. +- **R-IMPL-032**: Implementations MUST NOT round projected ET values in intermediate computations; rounding for display purposes MUST occur only at serialization time. + +### 11.5 Experimental Status Behavior + +Because the forecast command is marked **Experimental**: + +- **R-IMPL-040**: The implementation MUST emit a warning to stderr on every invocation indicating the experimental status of the command unless `--json` is specified (JSON callers are assumed to be automated pipelines that handle warnings separately). 
+- **R-IMPL-041**: The JSON output schema MAY have new fields added in minor versions without notice. Callers MUST treat unknown fields as ignorable. + +--- + +## 12. Compliance Testing + +### 12.1 Test Suite Requirements + +#### 12.1.1 Command Interface Tests + +- **T-FC-001**: Invocation with invalid `--days` value exits non-zero with descriptive error. +- **T-FC-002**: Invocation with invalid `--period` value exits non-zero with descriptive error. +- **T-FC-003**: Invocation with `--sample < 1` exits non-zero. +- **T-FC-004**: Invocation with invalid `--repo` format exits non-zero. +- **T-FC-005**: Unmatched `workflow_id` positional argument exits non-zero with identification of the unmatched value. + +#### 12.1.2 Workflow Discovery Tests + +- **T-FC-010**: Local mode: discovers workflows from `.github/workflows/*.lock.yml`. +- **T-FC-011**: Local mode: no lock files found exits with code `3`. +- **T-FC-012**: Remote mode: calls GitHub Actions API and matches workflow IDs case-insensitively. +- **T-FC-013**: Remote mode: missing frontmatter fields default to zero/empty without error. + +#### 12.1.3 Data Sampling Tests + +- **T-FC-020**: Sampling respects `--sample` limit. +- **T-FC-021**: Sampling respects `--days` historical window cutoff. +- **T-FC-022**: Run with missing `aw_info.json` artifact contributes zero ET and is still counted in `sampled_runs`. +- **T-FC-023**: Workflow with zero sampled runs produces nil projection with zero fields. + +#### 12.1.4 Monte Carlo Engine Tests + +- **T-FC-030**: With `λ ≤ 15`, Knuth's algorithm is used for Poisson draw (verifiable by seeded PRNG in test mode). +- **T-FC-031**: With `λ > 15`, Normal approximation is used; drawn value is non-negative. +- **T-FC-032**: With `λ = 0`, projected tokens is exactly `0` for all trials. +- **T-FC-033**: Bootstrap resampling draws with replacement from historical ET observations. +- **T-FC-034**: Only successful Bernoulli draws contribute ET to the trial total. 
+- **T-FC-035**: 10,000 trials are executed per workflow. +- **T-FC-036**: P10 ≤ P50 ≤ P90 for all non-zero projections. +- **T-FC-037**: `projected_effective_tokens` equals `p50_projected_effective_tokens`. + +#### 12.1.5 Episode Analysis Tests + +- **T-FC-040**: Runs sharing `headSha` and `headBranch` are grouped into the same episode. +- **T-FC-041**: `runs_per_episode` equals `sampled_run_count / sampled_episodes`. +- **T-FC-042**: Episode table is printed in console output when any workflow has `runs_per_episode > 1`. +- **T-FC-043**: Episode table is suppressed when all workflows have `runs_per_episode = 1.0`. + +#### 12.1.6 Output Format Tests + +- **T-FC-050**: Console output contains all required columns. +- **T-FC-051**: JSON output is valid JSON conforming to the schema in Section 9.2. +- **T-FC-052**: JSON `as_of` field is a valid RFC 3339 UTC timestamp. +- **T-FC-053**: JSON `workflows` array is sorted by `projected_effective_tokens` descending. +- **T-FC-054**: No stdout output (other than JSON) when `--json` is specified. +- **T-FC-055**: Experimental warning emitted to stderr unless `--json` is specified. 
+ +### 12.2 Compliance Checklist + +| Requirement | Test ID | Level | Status | +|---|---|---|---| +| Flag validation | T-FC-001–005 | 1 | Required | +| Local workflow discovery | T-FC-010–011 | 1 | Required | +| Remote workflow discovery | T-FC-012–013 | 2 | Required | +| Data sampling with limit and window | T-FC-020–021 | 1 | Required | +| Missing artifact graceful handling | T-FC-022 | 1 | Required | +| Nil projection for empty sample | T-FC-023 | 1 | Required | +| Knuth Poisson algorithm (λ ≤ 15) | T-FC-030 | 1 | Required | +| Normal approximation (λ > 15) | T-FC-031 | 1 | Required | +| Zero-λ projection | T-FC-032 | 1 | Required | +| Bootstrap resampling | T-FC-033 | 1 | Required | +| Bernoulli success filtering | T-FC-034 | 1 | Required | +| 10,000 trial count | T-FC-035 | 1 | Required | +| Percentile ordering | T-FC-036 | 1 | Required | +| P50 field consistency | T-FC-037 | 1 | Required | +| Episode grouping | T-FC-040–041 | 2 | Required | +| Episode table display logic | T-FC-042–043 | 2 | Required | +| Console output columns | T-FC-050 | 1 | Required | +| JSON schema conformance | T-FC-051–054 | 2 | Required | +| Experimental status warning | T-FC-055 | 1 | Required | + +--- + +## Appendices + +### Appendix A: Worked Example + +#### A.1 Scenario + +A workflow named `ci-doctor` has the following historical sample over 30 days: + +- 42 completed runs +- 5 runs missing `aw_info.json` (treated as 0 ET) +- ET observations (for the 37 runs with artifacts): range from 8,000 to 18,000, mean ≈ 12,500 +- 38 successful runs (success rate = 38/42 ≈ 0.905) +- Projection period: `month` (30 days) + +#### A.2 Observed Rate + +``` +observed_runs_per_period = (42 / 30) × 30 = 42.0 +λ = 42.0 +``` + +Since λ > 15, Normal approximation is used: `Normal(μ=42, σ=√42 ≈ 6.48)`. + +#### A.3 Single Trial + +Draw `k ~ round(Normal(42, 6.48)) = 44` (example). + +For each of the 44 runs: +1. Draw success: `Bernoulli(0.905)` → say 40 succeed. +2. 
For each of the 40 successful runs, draw one ET observation from the 37-item historical pool (bootstrap). +3. Sum the 40 ET draws. + +One trial might yield: 40 × 12,200 (average draw) ≈ 488,000 ET. + +#### A.4 After 10,000 Trials + +Sorted trial totals (example summary): + +``` +P10 ≈ 415,000 (10th percentile — lower bound of 80% CI) +P50 ≈ 479,000 (median — headline projection) +P90 ≈ 545,000 (90th percentile — upper bound of 80% CI) +mean ≈ 481,000 +std_dev ≈ 40,000 +``` + +### Appendix B: Poisson Algorithm Selection Rationale + +Knuth's exact Poisson algorithm is used for small λ (≤ 15) because it produces exact integer draws from the Poisson distribution without bias. For large λ, the Poisson distribution converges to a Normal distribution (`N(λ, λ)`), making the Normal approximation computationally efficient and sufficiently accurate. + +The threshold of λ = 15 is chosen as the crossover point where Normal approximation error is below 1% for the tails relevant to P10/P90 computation. Implementations MAY raise this threshold (e.g., to λ = 30) for greater accuracy at a minor performance cost. + +### Appendix C: Bootstrap Resampling Rationale + +Traditional projection models assume a parametric distribution (e.g., log-normal) for per-run token usage. Agentic workflow token usage is frequently multi-modal (e.g., simple tasks versus complex multi-step tasks) and exhibits heavy tails due to recursive sub-agent chains. Bootstrap resampling avoids distributional misspecification by directly sampling from the empirical distribution, preserving these characteristics faithfully. The tradeoff is that projections are bounded by observed extremes; extrapolation beyond observed maximum ET requires explicit assumption and is out of scope for this specification. 
+ +### Appendix D: Episode Count Lower-Bound Semantics + +For orchestrator workflows that primarily use `workflow_call` or `workflow_dispatch` triggers, episodes are initiated by calls from another workflow rather than directly by GitHub events. These cross-workflow links are embedded in `aw_info.json` artifacts and are unavailable during forecasting when artifacts cannot be retrieved. As a result, each received `workflow_call` is counted as a separate episode, causing the episode count to overcount episodes and undercount the linkage. This means `runs_per_episode` may appear closer to `1.0` than its true value. Callers MUST treat `sampled_episodes` as a lower-bound estimate in this scenario and SHOULD note this limitation in any capacity planning documents. + +### Appendix E: Security Considerations + +- **Credential scope**: The forecast command accesses the GitHub Actions API using the credentials of the `gh` CLI. Token permissions MUST include `actions:read` for the target repository. Callers SHOULD use the minimum necessary scope. +- **Artifact content**: The `aw_info.json` artifact MAY contain sensitive information such as prompt fragments embedded in ET metadata. Implementations MUST NOT log artifact payloads at verbosity levels accessible to non-administrative users. +- **Remote repository access**: When `--repo` targets a repository the caller does not own, the caller MUST have explicit read access. The implementation MUST NOT attempt to bypass or circumvent repository access controls. +- **JSON output**: The JSON output schema exposes token consumption patterns that MAY reveal information about system architecture and model configuration. JSON output SHOULD be treated as internal operational data and not exposed publicly. + +--- + +## References + +### Normative References + +- **[RFC 2119]** Bradner, S., "Key words for use in RFCs to Indicate Requirement Levels", BCP 14, RFC 2119, March 1997. +- **[RFC 3339]** Klyne, G. 
and Newman, C., "Date and Time on the Internet: Timestamps", RFC 3339, July 2002. +- **[ET-SPEC]** GitHub Agentic Workflows Team, "Effective Tokens Specification". [effective-tokens-specification](/gh-aw/reference/effective-tokens-specification/) +- **[EXP-SPEC]** GitHub Agentic Workflows Team, "A/B Experiments Specification". [experiments-specification](/gh-aw/reference/experiments-specification/) + +### Informative References + +- **[KNUTH-TAOCP]** Knuth, D.E., "The Art of Computer Programming, Volume 2: Seminumerical Algorithms", 3rd edition. Section 3.4.1 (Poisson distribution generation algorithm). +- **[BOOTSTRAP]** Efron, B. and Tibshirani, R., "An Introduction to the Bootstrap", Chapman & Hall, 1993. +- **[GH-ACTIONS-API]** GitHub, "GitHub Actions REST API Reference". + +--- + +## Change Log + +### Version 0.1.0 (Experimental Draft) + +- Initial specification for `gh aw forecast` command +- Defined command interface: flags `--days`, `--period`, `--sample`, `--repo`, `--json`, `--verbose` +- Defined local and remote workflow discovery modes +- Defined data sampling procedure and per-run metric derivation +- Defined Monte Carlo projection engine with Poisson + bootstrap algorithm +- Defined episode analysis with lower-bound semantics for orchestrator workflows +- Defined console table output format +- Defined JSON output schema (Sections 9.2.1–9.2.6) +- Defined error handling and exit codes +- Defined compliance test suite (T-FC-001 through T-FC-055) +- Added appendices: worked example, algorithm rationale, security considerations + +--- + +*Copyright © 2026 GitHub Agentic Workflows Team. 
All rights reserved.* diff --git a/pkg/cli/forecast.go b/pkg/cli/forecast.go new file mode 100644 index 00000000000..00c5d789777 --- /dev/null +++ b/pkg/cli/forecast.go @@ -0,0 +1,1059 @@ +package cli + +// This file implements the `forecast` command, which samples a workflow's recent +// GitHub Actions run history and projects forward effective token usage and yield +// on a per-week or per-month basis. +// +// Workflow metadata (trigger types, concurrency, experiments) is read from the +// workflow's Markdown frontmatter so that projections account for how often the +// workflow is actually expected to fire and how many concurrent runs it supports. + +import ( + "encoding/json" + "fmt" + "math" + "math/rand" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "time" + + "github.com/github/gh-aw/pkg/console" + "github.com/github/gh-aw/pkg/constants" + "github.com/github/gh-aw/pkg/gitutil" + "github.com/github/gh-aw/pkg/logger" + "github.com/github/gh-aw/pkg/parser" + "github.com/github/gh-aw/pkg/workflow" +) + +var forecastRunLog = logger.New("cli:forecast_run") + +// forecastPeriodDays maps period names to the number of days in a projection window. +var forecastPeriodDays = map[string]int{ + "week": 7, + "month": 30, +} + +// ForecastEpisodeSummary contains episode-level aggregate metrics derived from +// run history without downloading artifacts. Episodes are reconstructed from the +// fields available in the GitHub Actions run list (event type, head SHA, branch). +// Dispatch and workflow_call linkages that require aw_info.json are not available +// in this lightweight analysis, so the episode count is a lower-bound estimate. +type ForecastEpisodeSummary struct { + // SampledEpisodes is the number of distinct episodes detected in the sampled + // run history. Each "episode" represents one logical task execution, which may + // span multiple runs when a workflow dispatches sub-workflows. 
+ SampledEpisodes int `json:"sampled_episodes"` + // RunsPerEpisode is the average number of runs per episode (SampledRuns / + // SampledEpisodes). Values > 1 indicate orchestrator-style workflows that + // dispatch multiple sub-workflows per task. + RunsPerEpisode float64 `json:"runs_per_episode"` + // AvgEffectiveTokensPerEpisode is the mean effective-token count per episode. + AvgEffectiveTokensPerEpisode int `json:"avg_effective_tokens_per_episode"` + // ObservedEpisodesPerPeriod is the projected number of episodes in the forecast + // period, scaled from the observed episode frequency. + ObservedEpisodesPerPeriod float64 `json:"observed_episodes_per_period"` +} + +// ForecastWorkflowResult contains the projected metrics for a single workflow. +type ForecastWorkflowResult struct { + // WorkflowID is the short identifier of the workflow (basename without .md). + WorkflowID string `json:"workflow_id"` + // Period is the projection window ("week" or "month"). + Period string `json:"period"` + // SampledRuns is the number of completed runs used to derive per-run averages. + SampledRuns int `json:"sampled_runs"` + // HistoryDays is the number of calendar days covered by the sampled runs. + HistoryDays int `json:"history_days"` + + // Observed run frequency (derived from sampled run history). + ObservedRunsPerPeriod float64 `json:"observed_runs_per_period"` + + // SuccessRate is the fraction of sampled runs that completed successfully (0–1). + SuccessRate float64 `json:"success_rate"` + // Yield is the effective throughput: success rate × observed runs per period. + Yield float64 `json:"yield"` + + // Average per-run metrics (from completed runs). + AvgEffectiveTokens int `json:"avg_effective_tokens"` + AvgDurationSeconds float64 `json:"avg_duration_seconds"` + + // Projected totals for the period. + ProjectedEffectiveTokens int `json:"projected_effective_tokens"` + + // EpisodeAnalysis contains episode-level metrics derived from the sampled runs. 
+ // Nil when no completed runs were available to analyze. + EpisodeAnalysis *ForecastEpisodeSummary `json:"episode_analysis,omitempty"` + + // MonteCarlo contains the probability distribution of projected effective-token + // counts derived from a Monte Carlo simulation (10 000 trials). + // Nil when no completed runs were available. + MonteCarlo *ForecastMonteCarloSummary `json:"monte_carlo,omitempty"` + + // Trigger information derived from frontmatter. + ActiveTriggers []string `json:"active_triggers"` + // ConcurrencyLimit is the workflow-level concurrency limit (0 = unlimited). + ConcurrencyLimit int `json:"concurrency_limit"` + + // ExperimentVariants contains per-variant forecasts when the workflow defines A/B + // experiments. Nil when no experiments are present. + ExperimentVariants []ForecastVariantResult `json:"experiment_variants,omitempty"` + + // Evaluation contains backtesting quality metrics when --eval is set. + // Nil in normal forecast mode. + Evaluation *ForecastEvaluation `json:"evaluation,omitempty"` +} + +// ForecastVariantResult contains projected metrics split by A/B experiment variant. +type ForecastVariantResult struct { + ExperimentName string `json:"experiment_name"` + Variant string `json:"variant"` + RunCount int `json:"run_count"` + Fraction float64 `json:"fraction"` +} + +// ForecastEvaluation contains the quality metrics for a backtested forecast. +// It is populated only when --eval is set. The training window ends one +// projection period before now; the validation window is the most recent period. +type ForecastEvaluation struct { + // TrainingStartDate is the ISO-8601 date the training window began. + TrainingStartDate string `json:"training_start_date"` + // TrainingEndDate is the ISO-8601 date the training window ended + // (= the start of the validation window). + TrainingEndDate string `json:"training_end_date"` + // ValidationEndDate is the ISO-8601 date the validation window ended (= today). 
+ ValidationEndDate string `json:"validation_end_date"` + + // ActualRuns is the number of completed runs observed in the validation window. + ActualRuns int `json:"actual_runs"` + // ActualEffectiveTokens is the total effective-token count actually consumed + // in the validation window. + ActualEffectiveTokens int `json:"actual_effective_tokens"` + + // P50ErrorAbs is the signed difference (actual − P50 forecast) in effective tokens. + // Positive = actual was higher than forecast; negative = forecast over-estimated. + P50ErrorAbs int `json:"p50_error_abs"` + // P50ErrorPct is P50ErrorAbs as a percentage of the P50 forecast. + // NaN-safe: 0 when P50 is 0. + P50ErrorPct float64 `json:"p50_error_pct"` + // InCI is true when ActualEffectiveTokens fell within the P10–P90 confidence + // interval. A well-calibrated model should be in-CI ~80% of the time. + InCI bool `json:"in_ci"` +} + +// ForecastResult is the top-level output of the forecast command. +type ForecastResult struct { + Period string `json:"period"` + AsOf string `json:"as_of"` + EvalMode bool `json:"eval_mode,omitempty"` + Workflows []ForecastWorkflowResult `json:"workflows"` +} + +// RunForecast is the entry point for the forecast command. +func RunForecast(config ForecastConfig) error { + forecastRunLog.Printf("Running forecast: workflows=%v, days=%d, period=%s, eval=%v", config.WorkflowIDs, config.Days, config.Period, config.EvalMode) + + // Emit experimental warning so users know this command is not yet stable. + fmt.Fprintln(os.Stderr, console.FormatWarningMessage("forecast is an experimental command and may change without notice")) + + // Validate period. 
+ periodDays, ok := forecastPeriodDays[config.Period] + if !ok { + return fmt.Errorf("invalid period %q: must be 'week' or 'month'", config.Period) + } + if config.Days != 7 && config.Days != 30 { + return fmt.Errorf("invalid days value: %d; must be 7 or 30", config.Days) + } + if config.SampleSize <= 0 { + config.SampleSize = 100 + } + + // Resolve the list of workflow IDs to forecast. + workflowIDs, err := resolveForecastWorkflows(config) + if err != nil { + return err + } + if len(workflowIDs) == 0 { + fmt.Fprintln(os.Stderr, console.FormatWarningMessage("No agentic workflows found to forecast")) + return nil + } + + now := time.Now() + + // In eval mode, shift the entire date range back by one period so we can + // compare the forecast against the actual runs in the most recent period. + // + // ┌──────────────────────────────────────────────────────────────────┐ + // │ [anchor - days ... anchor] training │ [anchor ... now] val │ + // └──────────────────────────────────────────────────────────────────┘ + // anchor = now - periodDays + // + // Normal mode: startDate = now - days (no anchor shift). + var anchor time.Time + var validationStartDate, validationEndDate string + if config.EvalMode { + anchor = now.AddDate(0, 0, -periodDays) + validationStartDate = anchor.Format("2006-01-02") + validationEndDate = now.Format("2006-01-02") + fmt.Fprintln(os.Stderr, console.FormatInfoMessage( + fmt.Sprintf("Eval mode: training window ends %s; validation window %s → %s", + anchor.Format("2006-01-02"), validationStartDate, validationEndDate))) + } + + startDate := now.AddDate(0, 0, -config.Days).Format("2006-01-02") + if config.EvalMode { + // Training window ends at the anchor, not now. 
+ startDate = anchor.AddDate(0, 0, -config.Days).Format("2006-01-02") + } + + if !config.Verbose && !config.JSONOutput { + label := fmt.Sprintf("Forecasting %d workflow(s) using %d-day history → projecting per %s", + len(workflowIDs), config.Days, config.Period) + fmt.Fprintf(os.Stderr, "%s\n", console.FormatInfoMessage(label)) + } + + spinner := console.NewSpinner("Sampling workflow run history…") + if !config.Verbose { + spinner.Start() + } + + results := make([]ForecastWorkflowResult, 0, len(workflowIDs)) + for _, wfID := range workflowIDs { + if !config.Verbose { + spinner.UpdateMessage(fmt.Sprintf("Sampling %s…", wfID)) + } + + // forecastWorkflow uses the shifted startDate; in eval mode we also pass the + // anchor so the function knows where the training window ends. + result, err := forecastWorkflow(wfID, startDate, config, periodDays) + if err != nil { + if !config.Verbose { + spinner.Stop() + } + fmt.Fprintln(os.Stderr, console.FormatWarningMessage( + fmt.Sprintf("Skipping %s: %v", wfID, err))) + if !config.Verbose { + spinner.Start() + } + continue + } + + // In eval mode, fetch the validation-window runs and attach evaluation metrics. + if config.EvalMode { + result.Evaluation = evaluateForecast(wfID, result, validationStartDate, validationEndDate, config) + } + + results = append(results, result) + } + + if !config.Verbose { + spinner.Stop() + } + + // Sort results by Monte Carlo P50 (or point estimate when MC unavailable) descending. 
+ sort.Slice(results, func(i, j int) bool { + pi := results[i].ProjectedEffectiveTokens + if mc := results[i].MonteCarlo; mc != nil { + pi = mc.P50ProjectedEffectiveTokens + } + pj := results[j].ProjectedEffectiveTokens + if mc := results[j].MonteCarlo; mc != nil { + pj = mc.P50ProjectedEffectiveTokens + } + return pi > pj + }) + + output := ForecastResult{ + Period: config.Period, + AsOf: now.UTC().Format(time.RFC3339), + EvalMode: config.EvalMode, + Workflows: results, + } + + if config.JSONOutput { + return renderForecastJSON(output) + } + return renderForecastTable(output, config) +} + +// resolveForecastWorkflows returns the ordered list of workflow IDs to forecast. +// When WorkflowIDs is empty, all agentic workflow IDs in the repository are returned. +// When RepoOverride is set, workflows are discovered via the GitHub API instead of local files. +func resolveForecastWorkflows(config ForecastConfig) ([]string, error) { + if config.RepoOverride != "" { + return resolveForecastWorkflowsFromRemote(config.WorkflowIDs, config.RepoOverride, config.Verbose) + } + + if len(config.WorkflowIDs) > 0 { + // Resolve each provided ID to a canonical lock-file workflow name. + resolved := make([]string, 0, len(config.WorkflowIDs)) + for _, id := range config.WorkflowIDs { + name, err := workflow.FindWorkflowName(id) + if err != nil { + return nil, fmt.Errorf("workflow %q not found: %w", id, err) + } + resolved = append(resolved, name) + } + return resolved, nil + } + + // No explicit IDs: discover all agentic workflows from .lock.yml files. + names, err := getAgenticWorkflowNames(config.Verbose) + if err != nil { + return nil, fmt.Errorf("failed to discover agentic workflows: %w", err) + } + return names, nil +} + +// resolveForecastWorkflowsFromRemote resolves workflow names for a remote repository using +// the GitHub API. When ids is empty, all workflows in the remote repository are returned. 
+// When ids are provided, each is matched (case-insensitively) against remote workflow names +// and file-path basenames. +func resolveForecastWorkflowsFromRemote(ids []string, repoOverride string, verbose bool) ([]string, error) { + githubWorkflows, err := fetchGitHubWorkflows(repoOverride, verbose) + if err != nil { + return nil, fmt.Errorf("failed to list workflows in %s: %w", repoOverride, err) + } + + if len(ids) == 0 { + // Return display names for all workflows in the remote repo. + names := make([]string, 0, len(githubWorkflows)) + for _, wf := range githubWorkflows { + names = append(names, wf.Name) + } + sort.Strings(names) + return names, nil + } + + // Match each provided ID against the remote workflow list. + resolved := make([]string, 0, len(ids)) + for _, id := range ids { + matched := matchRemoteWorkflowName(id, githubWorkflows) + if matched == "" { + return nil, fmt.Errorf("workflow %q not found in %s", id, repoOverride) + } + resolved = append(resolved, matched) + } + return resolved, nil +} + +// matchRemoteWorkflowName returns the display name of the workflow in the remote map that +// best matches id. Matching is tried against the file-based key (e.g. "ci-doctor") and the +// display name (e.g. "CI Failure Doctor"), both case-insensitively. Returns "" on no match. +func matchRemoteWorkflowName(id string, workflows map[string]*GitHubWorkflow) string { + lowerID := strings.ToLower(id) + for key, wf := range workflows { + if strings.ToLower(key) == lowerID || strings.ToLower(wf.Name) == lowerID { + return wf.Name + } + } + return "" +} + +// forecastWorkflow computes a ForecastWorkflowResult for a single workflow. 
+func forecastWorkflow(workflowName, startDate string, config ForecastConfig, periodDays int) (ForecastWorkflowResult, error) { + result := ForecastWorkflowResult{ + WorkflowID: extractWorkflowIDFromName(workflowName), + Period: config.Period, + HistoryDays: config.Days, + } + + // Load frontmatter metadata (triggers, concurrency, experiments). + meta := loadWorkflowMeta(workflowName, config.Verbose) + result.ActiveTriggers = meta.activeTriggers + result.ConcurrencyLimit = meta.concurrencyLimit + result.ExperimentVariants = meta.variants + + // Determine the API name used to filter workflow runs (prefer lock file name). + apiName := workflowName + if lockFile, err := workflow.GetWorkflowLockFileName(workflowName); err == nil { + apiName = lockFile + } + + // Fetch completed runs from the history window. + opts := ListWorkflowRunsOptions{ + WorkflowName: apiName, + StartDate: startDate, + Limit: config.SampleSize, + RepoOverride: config.RepoOverride, + Verbose: config.Verbose, + } + + runs, _, err := listWorkflowRunsWithPagination(opts) + if err != nil { + if gitutil.IsRateLimitError(err.Error()) { + fmt.Fprintln(os.Stderr, console.FormatWarningMessage( + fmt.Sprintf("Skipping %s: GitHub API rate limit exceeded", result.WorkflowID))) + return result, nil + } + return result, err + } + + // Only use completed runs for metric computation. + completed := make([]WorkflowRun, 0, len(runs)) + for _, r := range runs { + if r.Status == "completed" { + // Compute Duration from StartedAt/UpdatedAt when not already set (gh run list + // does not populate the Duration field; health_command uses the same approach). + if r.Duration == 0 && !r.StartedAt.IsZero() && !r.UpdatedAt.IsZero() { + r.Duration = r.UpdatedAt.Sub(r.StartedAt) + } + // Enrich with ET from a locally-cached run summary when available. + // gh run list does not return token-usage fields; they are only stored in + // the aw_info.json artifacts downloaded by `gh aw logs`. 
Loading the cached + // RunSummary avoids re-downloading artifacts while still providing accurate + // ET observations for runs that have already been processed locally. + if r.EffectiveTokens == 0 { + r.EffectiveTokens = loadCachedEffectiveTokens(r.DatabaseID, config.Verbose) + } + completed = append(completed, r) + } + } + result.SampledRuns = len(completed) + + if len(completed) == 0 { + forecastRunLog.Printf("No completed runs found for %s in last %d days", workflowName, config.Days) + return result, nil + } + + // Compute per-run averages. + var totalET int + var totalDurSec float64 + successCount := 0 + etObservations := make([]int, 0, len(completed)) + + for _, r := range completed { + totalET += r.EffectiveTokens + totalDurSec += r.Duration.Seconds() + etObservations = append(etObservations, r.EffectiveTokens) + if r.Conclusion == "success" { + successCount++ + } + } + + n := len(completed) + result.AvgEffectiveTokens = totalET / n + result.AvgDurationSeconds = totalDurSec / float64(n) + result.SuccessRate = float64(successCount) / float64(n) + + // Compute observed run frequency: runs per calendar day over the history window, + // scaled to the projection period. + result.ObservedRunsPerPeriod = float64(n) / float64(config.Days) * float64(periodDays) + + // Effective throughput (yield) accounts for the success rate. + result.Yield = result.ObservedRunsPerPeriod * result.SuccessRate + + // Projected token usage (point estimate using simple means). + result.ProjectedEffectiveTokens = int(math.Round(result.ObservedRunsPerPeriod * float64(result.AvgEffectiveTokens))) + + // Monte Carlo simulation: model run-count (Poisson), per-run token usage + // (bootstrap), and per-run success (Bernoulli) to produce P10/P50/P90 ranges. 
+ rng := rand.New(rand.NewSource(time.Now().UnixNano())) //nolint:gosec // non-cryptographic simulation RNG + result.MonteCarlo = runMonteCarlo(etObservations, successCount, result.ObservedRunsPerPeriod, rng) + + // Populate experiment variant fractions from run history when metadata has variants. + result.ExperimentVariants = computeVariantFractions(result.ExperimentVariants, completed) + + // Build lightweight episode analysis from the completed runs using the fields + // available in the GitHub Actions run list (no artifact download required). + result.EpisodeAnalysis = buildForecastEpisodeSummary(completed, config.Days, periodDays) + + return result, nil +} + +// workflowMeta holds parsed metadata from a workflow's Markdown frontmatter. +type workflowMeta struct { + activeTriggers []string + concurrencyLimit int + variants []ForecastVariantResult +} + +// loadWorkflowMeta reads the workflow's Markdown file and extracts frontmatter metadata. +// Errors are non-fatal; a partial result is returned on failure. +func loadWorkflowMeta(workflowName string, verbose bool) workflowMeta { + meta := workflowMeta{} + + // Try to find the Markdown source file. + mdFile := findMarkdownFileForWorkflow(workflowName) + if mdFile == "" { + forecastRunLog.Printf("Markdown file not found for workflow %q", workflowName) + return meta + } + + content, err := os.ReadFile(mdFile) + if err != nil { + forecastRunLog.Printf("Failed to read Markdown file %q: %v", mdFile, err) + return meta + } + + result, err := parser.ExtractFrontmatterFromContent(string(content)) + if err != nil || result.Frontmatter == nil { + forecastRunLog.Printf("Failed to parse frontmatter for %q: %v", workflowName, err) + return meta + } + + cfg, err := workflow.ParseFrontmatterConfig(result.Frontmatter) + if err != nil || cfg == nil { + forecastRunLog.Printf("Failed to build FrontmatterConfig for %q: %v", workflowName, err) + return meta + } + + // Collect active trigger names. 
+ meta.activeTriggers = extractTriggerNames(cfg) + + // Concurrency limit: read the `cancel-in-progress` or derive from the concurrency map. + meta.concurrencyLimit = extractConcurrencyLimit(cfg) + + // Collect experiment variant names (counts come from run history later). + meta.variants = extractExperimentVariantStubs(cfg) + + return meta +} + +// findMarkdownFileForWorkflow tries to locate the .md source file for a workflow. +func findMarkdownFileForWorkflow(workflowName string) string { + // workflowName might be a display name like "CI Doctor" or a lock file like "ci-doctor.lock.yml". + // Try to reverse-engineer the md file path. + candidates := []string{ + fmt.Sprintf(".github/workflows/%s.md", workflowName), + } + // Strip known suffixes. + for _, sfx := range []string{".lock.yml", ".yml", ".yaml"} { + if base, ok := strings.CutSuffix(workflowName, sfx); ok { + // Also strip ".lock" from lock files. + base, _ = strings.CutSuffix(base, ".lock") + candidates = append(candidates, fmt.Sprintf(".github/workflows/%s.md", base)) + } + } + + for _, c := range candidates { + if _, err := os.Stat(c); err == nil { + return c + } + } + return "" +} + +// extractTriggerNames returns the list of active trigger event names from a workflow config. +func extractTriggerNames(cfg *workflow.FrontmatterConfig) []string { + if cfg.On == nil { + return nil + } + names := make([]string, 0, len(cfg.On)) + for k := range cfg.On { + names = append(names, k) + } + sort.Strings(names) + return names +} + +// extractConcurrencyLimit returns the workflow-level concurrency limit. +// Returns 0 when unlimited (no concurrency config) and 1 when concurrency is configured +// (either via cancel-in-progress or a concurrency group, since GitHub Actions queues at +// most one pending run when a concurrency group is set). 
+func extractConcurrencyLimit(cfg *workflow.FrontmatterConfig) int { + if cfg.Concurrency == nil { + return 0 + } + // When concurrency is configured with cancel-in-progress: true, effective concurrency = 1. + if v, ok := cfg.Concurrency["cancel-in-progress"]; ok { + if b, _ := v.(bool); b { + return 1 + } + } + // When there's a concurrency group without cancel-in-progress, runs queue up; treat as 1 + // active at a time by convention (GitHub Actions queues at most one pending run). + if _, hasGroup := cfg.Concurrency["group"]; hasGroup { + return 1 + } + return 0 +} + +// extractExperimentVariantStubs extracts experiment variant metadata from frontmatter. +// Run counts are not yet known at this stage; they are populated from run history later. +func extractExperimentVariantStubs(cfg *workflow.FrontmatterConfig) []ForecastVariantResult { + if len(cfg.ExperimentConfigs) == 0 { + return nil + } + stubs := make([]ForecastVariantResult, 0) + for expName, expCfg := range cfg.ExperimentConfigs { + if expCfg == nil { + continue + } + for _, variant := range expCfg.Variants { + stubs = append(stubs, ForecastVariantResult{ + ExperimentName: expName, + Variant: variant, + }) + } + } + sort.Slice(stubs, func(i, j int) bool { + if stubs[i].ExperimentName != stubs[j].ExperimentName { + return stubs[i].ExperimentName < stubs[j].ExperimentName + } + return stubs[i].Variant < stubs[j].Variant + }) + return stubs +} + +// computeVariantFractions populates run counts and fractions on the variant stubs +// by examining the DisplayTitle of sampled runs (gh-aw encodes the variant there). +// When no stubs are present (workflow has no experiments), returns nil. +func computeVariantFractions(stubs []ForecastVariantResult, runs []WorkflowRun) []ForecastVariantResult { + if len(stubs) == 0 { + return nil + } + + total := len(runs) + if total == 0 { + return stubs + } + + // Count how many run titles contain each variant name. 
+ for i, stub := range stubs { + count := 0 + for _, r := range runs { + if strings.Contains(r.DisplayTitle, stub.Variant) { + count++ + } + } + stubs[i].RunCount = count + stubs[i].Fraction = float64(count) / float64(total) + } + return stubs +} + +// extractWorkflowIDFromName returns the short workflow ID from a display/lock name. +func extractWorkflowIDFromName(name string) string { + for _, sfx := range []string{".lock.yml", ".yml", ".yaml"} { + if base, ok := strings.CutSuffix(name, sfx); ok { + base, _ = strings.CutSuffix(base, ".lock") + name = base + } + } + return name +} + +// workflowRunToRunData converts a WorkflowRun (sourced from the GitHub Actions API) +// to a RunData using the fields available without artifact downloads. Fields that +// require aw_info.json (AwContext, Repository, Ref, SHA, Actor, RunAttempt, …) are +// left as zero values; the episode engine degrades gracefully when they are absent. +func workflowRunToRunData(r WorkflowRun) RunData { + return RunData{ + RunID: r.DatabaseID, + Number: r.Number, + WorkflowName: r.WorkflowName, + WorkflowPath: r.WorkflowPath, + Status: r.Status, + Conclusion: r.Conclusion, + URL: r.URL, + Event: r.Event, + Branch: r.HeadBranch, + HeadSHA: r.HeadSha, + DisplayTitle: r.DisplayTitle, + CreatedAt: r.CreatedAt, + StartedAt: r.StartedAt, + UpdatedAt: r.UpdatedAt, + TokenUsage: r.TokenUsage, + EffectiveTokens: r.EffectiveTokens, + EstimatedCost: r.EstimatedCost, + } +} + +// buildForecastEpisodeSummary derives episode-level metrics from a slice of +// completed WorkflowRun objects using the lightweight episode engine. Returns nil +// when no runs are provided. +// +// Because only GitHub API fields are available (no aw_info.json artifacts), the +// episode engine can link runs via workflow_run event SHA/branch matching but +// cannot detect dispatch or workflow_call lineage. The resulting episode count is +// therefore a lower-bound estimate for orchestrator-style workflows. 
+func buildForecastEpisodeSummary(runs []WorkflowRun, historyDays, periodDays int) *ForecastEpisodeSummary { + if len(runs) == 0 { + return nil + } + + runData := make([]RunData, 0, len(runs)) + for _, r := range runs { + runData = append(runData, workflowRunToRunData(r)) + } + + // buildEpisodeData returns (episodes, edges); edges are not needed for + // the lightweight forecast summary so they are intentionally discarded. + episodes, _ := buildEpisodeData(runData, nil) + numEpisodes := len(episodes) + if numEpisodes == 0 { + return nil + } + + var totalEpisodeET int + for _, ep := range episodes { + totalEpisodeET += ep.TotalEffectiveTokens + } + + avgETPerEpisode := totalEpisodeET / numEpisodes + runsPerEpisode := float64(len(runs)) / float64(numEpisodes) + observedEpisodesPerPeriod := float64(numEpisodes) / float64(historyDays) * float64(periodDays) + + return &ForecastEpisodeSummary{ + SampledEpisodes: numEpisodes, + RunsPerEpisode: runsPerEpisode, + AvgEffectiveTokensPerEpisode: avgETPerEpisode, + ObservedEpisodesPerPeriod: observedEpisodesPerPeriod, + } +} + +// loadCachedEffectiveTokens looks up a locally-cached RunSummary for the given +// run ID and returns the TotalEffectiveTokens from its TokenUsage summary. +// Returns 0 when no cache exists or the cache does not contain token data. +// This avoids re-downloading aw_info.json artifacts for runs already processed by +// `gh aw logs` while still providing accurate ET observations for the simulation. 
+// +// Cache location: /run-/run_summary.json +// (defaultLogsOutputDir is ".github/aw/logs" — defined in logs_models.go) +func loadCachedEffectiveTokens(runID int64, verbose bool) int { + dir := filepath.Join(defaultLogsOutputDir, fmt.Sprintf("run-%d", runID)) + summary, ok := loadRunSummary(dir, verbose) + if !ok || summary == nil { + return 0 + } + if summary.TokenUsage != nil && summary.TokenUsage.TotalEffectiveTokens > 0 { + return summary.TokenUsage.TotalEffectiveTokens + } + // Fallback: legacy run summaries (written before TokenUsage was a separate + // field) may have stored the computed ET directly on the Run struct. + if summary.Run.EffectiveTokens > 0 { + return summary.Run.EffectiveTokens + } + return 0 +} + +// evaluateForecast fetches actual completed runs in the validation window and +// returns a ForecastEvaluation comparing them against the Monte Carlo forecast. +// +// validationStartDate / validationEndDate are ISO-8601 strings bracketing the +// period that was forecast (= one projection period immediately before now). +// Actual runs are fetched with the same pagination helper used for training, +// but with the validation date range. +func evaluateForecast(workflowName string, forecast ForecastWorkflowResult, validationStartDate, validationEndDate string, config ForecastConfig) *ForecastEvaluation { + // Compute the actual ISO-8601 training start date by subtracting HistoryDays + // from the validation start (= anchor). + var trainingStartDate string + if t, err := time.Parse("2006-01-02", validationStartDate); err == nil { + trainingStartDate = t.AddDate(0, 0, -forecast.HistoryDays).Format("2006-01-02") + } else { + trainingStartDate = validationStartDate + } + eval := &ForecastEvaluation{ + TrainingStartDate: trainingStartDate, + TrainingEndDate: validationStartDate, + ValidationEndDate: validationEndDate, + } + + // Determine the API name used to filter workflow runs. 
+ apiName := workflowName + if lockFile, err := workflow.GetWorkflowLockFileName(workflowName); err == nil { + apiName = lockFile + } + + // Fetch completed runs in the validation window. + opts := ListWorkflowRunsOptions{ + WorkflowName: apiName, + StartDate: validationStartDate, + Limit: config.SampleSize, + RepoOverride: config.RepoOverride, + Verbose: config.Verbose, + } + runs, _, err := listWorkflowRunsWithPagination(opts) + if err != nil { + forecastRunLog.Printf("Eval: failed to fetch validation runs for %s: %v", workflowName, err) + return eval + } + + // Filter to completed runs that fall within the validation window. + validationEnd := time.Now() + validationStart, _ := time.Parse("2006-01-02", validationStartDate) + for _, r := range runs { + if r.Status != "completed" { + continue + } + // Skip runs with no timestamp — we cannot verify they belong to the + // validation window, so including them would introduce undefined bias. + if r.StartedAt.IsZero() { + continue + } + if r.StartedAt.Before(validationStart) || r.StartedAt.After(validationEnd) { + continue + } + if r.EffectiveTokens == 0 { + r.EffectiveTokens = loadCachedEffectiveTokens(r.DatabaseID, config.Verbose) + } + eval.ActualRuns++ + eval.ActualEffectiveTokens += r.EffectiveTokens + } + + // Compute error metrics against P50 (falls back to point estimate). 
+ p50 := forecast.ProjectedEffectiveTokens + p10 := forecast.ProjectedEffectiveTokens + p90 := forecast.ProjectedEffectiveTokens + if mc := forecast.MonteCarlo; mc != nil { + p50 = mc.P50ProjectedEffectiveTokens + p10 = mc.P10ProjectedEffectiveTokens + p90 = mc.P90ProjectedEffectiveTokens + } + + eval.P50ErrorAbs = eval.ActualEffectiveTokens - p50 + if p50 > 0 { + eval.P50ErrorPct = float64(eval.P50ErrorAbs) / float64(p50) * 100 + } + eval.InCI = eval.ActualEffectiveTokens >= p10 && eval.ActualEffectiveTokens <= p90 + + return eval +} + +// ── Rendering ─────────────────────────────────────────────────────────────── + +// renderForecastJSON outputs the forecast result as pretty-printed JSON. +func renderForecastJSON(output ForecastResult) error { + b, err := json.MarshalIndent(output, "", " ") + if err != nil { + return fmt.Errorf("failed to marshal forecast JSON: %w", err) + } + fmt.Println(string(b)) + return nil +} + +// forecastTableRow is a flattened struct used for console table rendering. +type forecastTableRow struct { + Workflow string `json:"workflow" console:"header:Workflow"` + Runs int `json:"runs" console:"header:Sampled Runs"` + SuccessRate string `json:"success_rate" console:"header:Success Rate"` + Yield string `json:"yield" console:"header:Yield/Period"` + AvgEffectiveTokens string `json:"avg_effective_tokens" console:"header:Avg ET"` + ProjectedTokens string `json:"projected_tokens" console:"header:Proj. ET (P50)"` + ETRange string `json:"et_range" console:"header:80% CI (P10–P90)"` + Triggers string `json:"triggers" console:"header:Triggers"` +} + +// renderForecastTable renders the forecast result as a human-readable table. 
+func renderForecastTable(output ForecastResult, config ForecastConfig) error { + periodLabel := strings.ToUpper(output.Period[:1]) + output.Period[1:] + fmt.Fprintln(os.Stderr, console.FormatInfoMessage( + fmt.Sprintf("Workflow Forecast — per %s (based on last %d days of history)", periodLabel, config.Days))) + fmt.Fprintln(os.Stderr, "") + + anyUnreliable := false + rows := make([]forecastTableRow, 0, len(output.Workflows)) + for _, wf := range output.Workflows { + // Use Monte Carlo P50 as the primary ET estimate when available. + projETStr := formatForecastTokens(wf.ProjectedEffectiveTokens) + etRangeStr := "-" + unreliableMark := "" + if mc := wf.MonteCarlo; mc != nil { + projETStr = formatForecastTokens(mc.P50ProjectedEffectiveTokens) + etRangeStr = fmt.Sprintf("%s–%s", + formatForecastTokens(mc.P10ProjectedEffectiveTokens), + formatForecastTokens(mc.P90ProjectedEffectiveTokens)) + if !mc.IsReliable { + anyUnreliable = true + unreliableMark = "*" + } + } + row := forecastTableRow{ + Workflow: wf.WorkflowID + unreliableMark, + Runs: wf.SampledRuns, + SuccessRate: formatForecastPercent(wf.SuccessRate, wf.SampledRuns > 0), + Yield: fmt.Sprintf("%.1f", wf.Yield), + AvgEffectiveTokens: formatForecastTokens(wf.AvgEffectiveTokens), + ProjectedTokens: projETStr, + ETRange: etRangeStr, + Triggers: formatTriggerList(wf.ActiveTriggers), + } + rows = append(rows, row) + } + + fmt.Fprint(os.Stderr, console.RenderStruct(rows)) + fmt.Fprintln(os.Stderr, "") + + // Show episode analysis when any workflow has multi-run episodes. + anyMultiRunEpisodes := false + for _, wf := range output.Workflows { + if wf.EpisodeAnalysis != nil && wf.EpisodeAnalysis.RunsPerEpisode > 1.0 { + anyMultiRunEpisodes = true + break + } + } + if anyMultiRunEpisodes { + printEpisodeBreakdown(output.Workflows) + } + + // Show experiment variant details when present. 
+ for _, wf := range output.Workflows { + if len(wf.ExperimentVariants) > 0 { + printVariantBreakdown(wf) + } + } + + // Show backtesting evaluation table in --eval mode. + if output.EvalMode { + printEvalBreakdown(output.Workflows) + } + + fmt.Fprintln(os.Stderr, console.FormatInfoMessage( + fmt.Sprintf("P50 = median; 80%% CI = P10–P90 from %d-trial Monte Carlo simulation (Gamma–Poisson model accounts for rate estimation uncertainty).", monteCarloIterations))) + if anyUnreliable { + fmt.Fprintln(os.Stderr, console.FormatWarningMessage( + fmt.Sprintf("* Fewer than %d sampled runs — confidence intervals may be unreliable.", minObservationsForReliableForecast))) + } + fmt.Fprintln(os.Stderr, console.FormatInfoMessage( + fmt.Sprintf("Run '%s forecast --json' for full output.", string(constants.CLIExtensionPrefix)))) + return nil +} + +// printEpisodeBreakdown renders per-episode ET metrics for workflows that have +// multi-run episodes (i.e. orchestrator-style workflows dispatching sub-workflows). 
+func printEpisodeBreakdown(workflows []ForecastWorkflowResult) { + type episodeRow struct { + Workflow string `json:"workflow" console:"header:Workflow"` + Episodes int `json:"episodes" console:"header:Episodes"` + RunsPerEpisode string `json:"runs_per_episode" console:"header:Runs/Episode"` + AvgETPerEpisode string `json:"avg_et_per_episode" console:"header:Avg ET/Episode"` + EpisodesPerPeriod string `json:"episodes_per_period" console:"header:Episodes/Period"` + } + + fmt.Fprintln(os.Stderr, console.FormatInfoMessage("Episode analysis (runs grouped by logical task):")) + epRows := make([]episodeRow, 0, len(workflows)) + for _, wf := range workflows { + ep := wf.EpisodeAnalysis + if ep == nil { + continue + } + epRows = append(epRows, episodeRow{ + Workflow: wf.WorkflowID, + Episodes: ep.SampledEpisodes, + RunsPerEpisode: fmt.Sprintf("%.1f", ep.RunsPerEpisode), + AvgETPerEpisode: formatForecastTokens(ep.AvgEffectiveTokensPerEpisode), + EpisodesPerPeriod: fmt.Sprintf("%.1f", ep.ObservedEpisodesPerPeriod), + }) + } + fmt.Fprint(os.Stderr, console.RenderStruct(epRows)) + fmt.Fprintln(os.Stderr, "") +} + +// printEvalBreakdown renders the backtesting comparison table. 
+func printEvalBreakdown(workflows []ForecastWorkflowResult) { + type evalRow struct { + Workflow string `json:"workflow" console:"header:Workflow"` + ActualRuns int `json:"actual_runs" console:"header:Actual Runs"` + ActualET string `json:"actual_et" console:"header:Actual ET"` + ForecastP50 string `json:"forecast_p50" console:"header:Forecast P50"` + ErrorAbs string `json:"error_abs" console:"header:Error (abs)"` + ErrorPct string `json:"error_pct" console:"header:Error %"` + InCI string `json:"in_ci" console:"header:In 80% CI?"` + } + + fmt.Fprintln(os.Stderr, console.FormatInfoMessage("Backtesting evaluation (actual vs forecasted):")) + var rows []evalRow + for _, wf := range workflows { + ev := wf.Evaluation + if ev == nil { + continue + } + p50 := wf.ProjectedEffectiveTokens + if mc := wf.MonteCarlo; mc != nil { + p50 = mc.P50ProjectedEffectiveTokens + } + inCI := "No" + if ev.InCI { + inCI = "Yes ✓" + } + rows = append(rows, evalRow{ + Workflow: wf.WorkflowID, + ActualRuns: ev.ActualRuns, + ActualET: formatForecastTokens(ev.ActualEffectiveTokens), + ForecastP50: formatForecastTokens(p50), + ErrorAbs: formatForecastSignedTokens(ev.P50ErrorAbs), + ErrorPct: fmt.Sprintf("%.1f%%", ev.P50ErrorPct), + InCI: inCI, + }) + } + fmt.Fprint(os.Stderr, console.RenderStruct(rows)) + fmt.Fprintln(os.Stderr, "") + fmt.Fprintln(os.Stderr, console.FormatInfoMessage( + "Training window ended at the forecast anchor; validation window is the following projection period.")) +} + + +func printVariantBreakdown(wf ForecastWorkflowResult) { + type variantRow struct { + Experiment string `json:"experiment" console:"header:Experiment"` + Variant string `json:"variant" console:"header:Variant"` + Runs int `json:"runs" console:"header:Runs"` + Fraction string `json:"fraction" console:"header:Fraction"` + } + + fmt.Fprintf(os.Stderr, " Experiment variants for %s:\n", wf.WorkflowID) + varRows := make([]variantRow, 0, len(wf.ExperimentVariants)) + for _, v := range wf.ExperimentVariants { + 
varRows = append(varRows, variantRow{ + Experiment: v.ExperimentName, + Variant: v.Variant, + Runs: v.RunCount, + Fraction: formatForecastPercent(v.Fraction, wf.SampledRuns > 0), + }) + } + fmt.Fprint(os.Stderr, console.RenderStruct(varRows)) + fmt.Fprintln(os.Stderr, "") +} + +// ── Format helpers ─────────────────────────────────────────────────────────── + +// formatForecastPercent formats v as a percentage string. +// hasData must be false when the underlying sample is empty (no runs), in which +// case "N/A" is returned; otherwise the value (including 0%) is formatted. +func formatForecastPercent(v float64, hasData bool) string { + if !hasData { + return "N/A" + } + return fmt.Sprintf("%.0f%%", v*100) +} + +func formatForecastTokens(n int) string { + if n == 0 { + return "-" + } + if n < 1000 { + return strconv.Itoa(n) + } + if n < 1_000_000 { + return fmt.Sprintf("%.1fK", float64(n)/1000) + } + return fmt.Sprintf("%.2fM", float64(n)/1_000_000) +} + +// formatForecastSignedTokens formats a signed integer token count, preserving +// the sign so callers can display positive/negative deltas (e.g., error abs). +func formatForecastSignedTokens(n int) string { + if n == 0 { + return "0" + } + sign := "" + v := n + if n < 0 { + sign = "-" + v = -n + } + return sign + formatForecastTokens(v) +} + +func formatTriggerList(triggers []string) string { + if len(triggers) == 0 { + return "-" + } + if len(triggers) <= 3 { + return strings.Join(triggers, ", ") + } + return strings.Join(triggers[:3], ", ") + fmt.Sprintf(" +%d", len(triggers)-3) +} diff --git a/pkg/cli/forecast_command.go b/pkg/cli/forecast_command.go new file mode 100644 index 00000000000..e01c4561710 --- /dev/null +++ b/pkg/cli/forecast_command.go @@ -0,0 +1,110 @@ +package cli + +import ( + "github.com/github/gh-aw/pkg/constants" + "github.com/spf13/cobra" +) + +// ForecastConfig holds configuration for forecast command execution. 
type ForecastConfig struct {
	// WorkflowIDs is the set of workflow IDs to forecast. When empty, all agentic
	// workflows in the repository are included.
	WorkflowIDs []string
	// Days is the historical window used to sample workflow runs.
	Days int
	// Period controls the aggregation granularity: "week" or "month".
	Period string
	// JSONOutput enables machine-readable JSON output.
	JSONOutput bool
	// Verbose enables verbose diagnostic output.
	Verbose bool
	// RepoOverride optionally targets a different repository.
	RepoOverride string
	// SampleSize is the maximum number of completed runs to sample per workflow.
	SampleSize int
	// EvalMode enables backtesting mode: the training window is shifted back by
	// one projection period and forecast quality is evaluated against the actual
	// runs observed in that period.
	EvalMode bool
}

// NewForecastCommand creates the forecast command.
//
// The command accepts any number of workflow IDs as positional arguments,
// registers the --days, --period, --sample and --eval flags plus the shared
// repo and JSON flags, and hands the collected values to RunForecast.
func NewForecastCommand() *cobra.Command {
	cmd := &cobra.Command{
		Use:   "forecast [workflow]...",
		Short: "Forecast token usage and costs for agentic workflows (experimental)",
		Long: `[EXPERIMENTAL] Forecast effective token usage for agentic workflows by sampling
recent run history and projecting forward on a per-week or per-month basis.

The forecaster downloads a sample of recent completed workflow runs and derives
per-run metrics (effective tokens, duration, success rate). When runs have been
previously processed by 'gh aw logs', cached token-usage data is used. The
observed run frequency is then projected to the target period using a Monte Carlo
simulation that models three sources of uncertainty: run count (Poisson), per-run
token usage (bootstrap resampling), and per-run success (Bernoulli).

Accounts for:
 - A/B experiment variants (results are split per variant when present)
 - Observed run frequency from GitHub Actions history
 - Per-run success rate

If no workflow arguments are provided, all agentic workflows in the repository
are included and displayed side-by-side for easy comparison.

Multiple workflow IDs may be provided to compare specific workflows.

Backtesting (--eval):
 Shifts the training window back by one projection period, builds the forecast,
 then measures actual runs in that period and computes quality metrics:
 P50 absolute/percentage error and whether the actual value fell inside the
 P10–P90 confidence interval. Use this to validate the model before relying on
 forward projections.

` + WorkflowIDExplanation + `

Examples:
 ` + string(constants.CLIExtensionPrefix) + ` forecast # Forecast all workflows (monthly)
 ` + string(constants.CLIExtensionPrefix) + ` forecast ci-doctor # Forecast a specific workflow
 ` + string(constants.CLIExtensionPrefix) + ` forecast ci-doctor daily-planner # Compare two workflows
 ` + string(constants.CLIExtensionPrefix) + ` forecast --period week # Weekly projections
 ` + string(constants.CLIExtensionPrefix) + ` forecast --days 7 # Use 7-day history window
 ` + string(constants.CLIExtensionPrefix) + ` forecast --sample 50 # Sample up to 50 runs per workflow
 ` + string(constants.CLIExtensionPrefix) + ` forecast --json # Machine-readable JSON output
 ` + string(constants.CLIExtensionPrefix) + ` forecast --repo owner/repo # Forecast in another repository
 ` + string(constants.CLIExtensionPrefix) + ` forecast --eval # Backtest: evaluate forecast quality against past data`,
		Args: cobra.ArbitraryArgs,
		RunE: func(cmd *cobra.Command, args []string) error {
			// Flag lookup errors are discarded here, so a flag that cannot be
			// read yields its zero value.
			// NOTE(review): "verbose" is not registered in this function —
			// presumably it is inherited from the root command; confirm.
			days, _ := cmd.Flags().GetInt("days")
			period, _ := cmd.Flags().GetString("period")
			jsonOutput, _ := cmd.Flags().GetBool("json")
			verbose, _ := cmd.Flags().GetBool("verbose")
			repoOverride, _ := cmd.Flags().GetString("repo")
			sampleSize, _ := cmd.Flags().GetInt("sample")
			evalMode, _ := cmd.Flags().GetBool("eval")

			config := ForecastConfig{
				WorkflowIDs:  args,
				Days:         days,
				Period:       period,
				JSONOutput:   jsonOutput,
				Verbose:      verbose,
				RepoOverride: repoOverride,
				SampleSize:   sampleSize,
				EvalMode:     evalMode,
			}

			return RunForecast(config)
		},
	}

	cmd.Flags().Int("days", 30, "Historical window in days used to sample run history (7 or 30)")
	cmd.Flags().String("period", "month", "Aggregation period for projections: week or month")
	cmd.Flags().Int("sample", 100, "Maximum number of completed runs to sample per workflow")
	cmd.Flags().Bool("eval", false, "Evaluate forecast quality against past data (backtesting mode)")
	addRepoFlag(cmd)
	addJSONFlag(cmd)

	// Shell completion for the positional workflow arguments.
	cmd.ValidArgsFunction = CompleteWorkflowNames

	return cmd
}
diff --git a/pkg/cli/forecast_montecarlo.go b/pkg/cli/forecast_montecarlo.go
new file mode 100644
index 00000000000..c4563fddd75
--- /dev/null
+++ b/pkg/cli/forecast_montecarlo.go
@@ -0,0 +1,271 @@
package cli

// This file implements a Monte Carlo simulation engine for the forecast command.
// It models three independent sources of uncertainty:
//
//  1. Run-count uncertainty — the number of workflow executions in a future period
//     follows a Poisson process. The arrival rate λ is itself uncertain (estimated
//     from a finite history window), so each trial draws λ from its Bayesian posterior
//     Gamma(n+0.5, scale=observedRunsPerPeriod/n), where n is the observed run count
//     and 0.5 is the Jeffreys non-informative prior shape. This Gamma–Poisson
//     (Negative Binomial) compound model naturally produces wider confidence intervals
//     when data are sparse and converges to the classical Poisson estimate as n grows.
//  2. Per-run token usage variability — effective tokens per run are drawn via
//     bootstrap resampling from the historical observations, capturing the empirical
//     distribution without assuming a parametric form.
//  3. Per-run success uncertainty — each run independently succeeds with probability
//     equal to the historical success rate (Bernoulli model).
//
// Running 10 000 trials and reporting P10/P50/P90 gives conservative and optimistic
// estimates alongside the median, which is more informative than a single point
// estimate for capacity planning.

import (
	"math"
	"math/rand"
	"sort"
)

// monteCarloIterations is the number of simulation trials per workflow.
// 10 000 gives < 1% Monte Carlo error on percentile estimates and runs in < 10 ms
// for typical sample sizes.
const monteCarloIterations = 10_000

// minObservationsForReliableForecast is the minimum number of completed run
// observations required for confidence intervals to be considered statistically
// meaningful. Forecasts based on fewer observations are returned but flagged
// IsReliable = false.
const minObservationsForReliableForecast = 5

// ForecastMonteCarloSummary contains the probability distribution of projected
// effective-token counts derived from a Monte Carlo simulation.
//
// The simulation models run-count uncertainty via a Gamma–Poisson (Negative
// Binomial) compound process, per-run token usage via bootstrap resampling of
// historical observations, and per-run success probability via a Bernoulli draw.
// Percentile estimates (P10/P50/P90) give optimistic, median, and conservative
// bounds for the forecast period.
type ForecastMonteCarloSummary struct {
	// Iterations is the number of simulation trials that were run.
	Iterations int `json:"iterations"`
	// MeanProjectedEffectiveTokens is the arithmetic mean of simulated ET totals across all trials.
	MeanProjectedEffectiveTokens int `json:"mean_projected_effective_tokens"`
	// StdDevEffectiveTokens is the standard deviation of simulated ET totals (spread of distribution).
	StdDevEffectiveTokens float64 `json:"std_dev_effective_tokens"`
	// P10ProjectedEffectiveTokens is the 10th-percentile ET count — only 10% of simulated outcomes
	// fall below this value (optimistic bound).
	P10ProjectedEffectiveTokens int `json:"p10_projected_effective_tokens"`
	// P50ProjectedEffectiveTokens is the median simulated ET count.
	P50ProjectedEffectiveTokens int `json:"p50_projected_effective_tokens"`
	// P90ProjectedEffectiveTokens is the 90th-percentile ET count — 90% of simulated outcomes fall
	// below this value (conservative / budget bound).
	P90ProjectedEffectiveTokens int `json:"p90_projected_effective_tokens"`
	// IsReliable is true when the simulation was based on at least minObservationsForReliableForecast
	// completed runs. When false the confidence intervals may be very wide or unreliable.
	IsReliable bool `json:"is_reliable"`
}

// runMonteCarlo runs a Monte Carlo simulation to estimate the probability distribution
// of projected effective-token usage over the forecast period.
//
// Parameters:
//   - etObservations: per-run effective-token counts from historical completed runs.
//   - successCount: number of those runs that concluded "success".
//   - observedRunsPerPeriod: point estimate of expected runs in the projection period.
//   - rng: caller-supplied random number generator (allows deterministic testing).
//
// The run-count rate λ is treated as uncertain and drawn each trial from its
// Bayesian posterior Gamma(n+0.5, scale=observedRunsPerPeriod/n), where n is the
// number of historical observations and 0.5 is the Jeffreys non-informative prior
// shape. This compound Gamma–Poisson model is equivalent to a Negative Binomial
// and naturally produces wider confidence intervals for small samples, converging to
// the classical Poisson(observedRunsPerPeriod) model as n → ∞.
//
// Returns nil when etObservations is empty or observedRunsPerPeriod ≤ 0.
func runMonteCarlo(etObservations []int, successCount int, observedRunsPerPeriod float64, rng *rand.Rand) *ForecastMonteCarloSummary {
	n := len(etObservations)
	if n == 0 || observedRunsPerPeriod <= 0 {
		return nil
	}

	successRate := float64(successCount) / float64(n)

	// Bayesian posterior parameters for the Poisson arrival rate λ.
	// Prior: Jeffreys improper prior ∝ 1/√λ — equivalent to Gamma(0.5, ∞).
	// Likelihood: observedCount ~ Poisson(λ × historyWindow).
	// Posterior: λ_period | n ~ Gamma(shape=n+0.5, scale=observedRunsPerPeriod/n).
	// Mean of this Gamma = (n+0.5)/n × observedRunsPerPeriod ≈ observedRunsPerPeriod.
	gammaShape := float64(n) + 0.5
	gammaScale := observedRunsPerPeriod / float64(n)

	simETs := make([]int, monteCarloIterations)

	// The order of the rng draws below (Gamma, Poisson, then per run a success
	// draw followed by a bootstrap index) determines the output for a given
	// seed; reordering them changes seeded results.
	for i := 0; i < monteCarloIterations; i++ {
		// Draw run-count rate from posterior Gamma (accounts for estimation uncertainty in λ).
		lambdaTrial := gammaSample(rng, gammaShape) * gammaScale
		// Draw number of runs from Poisson(λ_trial).
		numRuns := poissonSample(rng, lambdaTrial)

		var totalET int
		for j := 0; j < numRuns; j++ {
			// Each run succeeds independently with probability successRate.
			// Failed runs contribute no tokens to the trial total.
			if rng.Float64() >= successRate {
				continue
			}
			// Bootstrap: sample ET from the empirical distribution.
			totalET += etObservations[rng.Intn(n)]
		}

		simETs[i] = totalET
	}

	// Sort for percentile computation.
	sort.Ints(simETs)

	mean, stddev := meanStdDevInt(simETs)

	return &ForecastMonteCarloSummary{
		Iterations:                   monteCarloIterations,
		MeanProjectedEffectiveTokens: mean,
		StdDevEffectiveTokens:        stddev,
		P10ProjectedEffectiveTokens:  percentileInt(simETs, 10),
		P50ProjectedEffectiveTokens:  percentileInt(simETs, 50),
		P90ProjectedEffectiveTokens:  percentileInt(simETs, 90),
		IsReliable:                   n >= minObservationsForReliableForecast,
	}
}

// poissonSample draws a random variate from Poisson(lambda).
//
// For lambda ≤ 15 it uses Knuth's multiplicative algorithm (exact, O(lambda) per sample).
// For lambda > 15 it uses a Normal approximation, which is accurate to
// within 0.3% for the tails that matter in forecasting contexts, and avoids
// the linear cost that becomes significant at 10 000 trials.
//
// lambda ≤ 0 returns 0.
func poissonSample(rng *rand.Rand, lambda float64) int {
	if lambda <= 0 {
		return 0
	}
	if lambda <= 15 {
		// Knuth's algorithm: O(lambda) per sample, exact.
		// Multiply uniforms together until the product drops to e^-λ or below;
		// the number of factors needed, minus one, is the variate.
		L := math.Exp(-lambda)
		k := 0
		p := 1.0
		for {
			k++
			p *= rng.Float64()
			if p <= L {
				break
			}
		}
		return k - 1
	}
	// Normal approximation: Poisson(λ) ≈ Normal(mean λ, standard deviation √λ) for large λ.
	v := lambda + math.Sqrt(lambda)*rng.NormFloat64()
	if v < 0 {
		// A count cannot be negative; clamp the lower tail of the approximation.
		return 0
	}
	return int(math.Round(v))
}

// gammaSample draws a random variate from Gamma(shape, scale=1) using the
// Marsaglia-Tsang squeeze method for shape ≥ 1, and the reduction
// Gamma(shape) = Gamma(shape+1) × U^(1/shape) for 0 < shape < 1.
//
// References: Marsaglia & Tsang (2000), "A Simple Method for Generating Gamma Variables".
//
// shape ≤ 0 is a caller error; the function returns 0 as a defensive no-op
// consistent with poissonSample's treatment of lambda ≤ 0. All call sites in the
// simulation pass shape = n+0.5 (n ≥ 1), so this branch is never reached in
// practice.
//
// A NaN shape is rejected the same way as shape ≤ 0 (the function returns 0).
// Without that guard NaN would pass both range checks and the candidate loop
// could never produce an acceptable value, so the call would not terminate.
func gammaSample(rng *rand.Rand, shape float64) float64 {
	if math.IsNaN(shape) || shape <= 0 {
		return 0
	}
	if shape < 1 {
		// Reduce to shape+1 via the identity X = Y × U^(1/shape).
		// The Gamma(shape+1) variate is drawn first and the uniform second;
		// keep this order so seeded sequences stay reproducible.
		return gammaSample(rng, shape+1) * math.Pow(rng.Float64(), 1.0/shape)
	}
	// Marsaglia-Tsang method for shape ≥ 1.
	d := shape - 1.0/3.0
	c := 1.0 / math.Sqrt(9.0*d)
	for {
		var x, v float64
		// Redraw the normal variate until 1 + c·x is positive, because the
		// log-space test below takes math.Log(v).
		for {
			x = rng.NormFloat64()
			v = 1.0 + c*x
			if v > 0 {
				break
			}
		}
		v = v * v * v
		u := rng.Float64()
		xsq := x * x
		// Fast acceptance (squeeze step).
		if u < 1.0-0.0331*(xsq*xsq) {
			return d * v
		}
		// Slower acceptance (log-space step).
		if math.Log(u) < 0.5*xsq+d*(1.0-v+math.Log(v)) {
			return d * v
		}
	}
}

// meanStdDevInt computes the arithmetic mean and population standard deviation
// of the int slice xs. An empty slice yields (0, 0).
//
// The mean is returned as an int (integer division, truncated toward zero),
// which is consistent with the ET token counts throughout the forecast output.
// The standard deviation is computed around the exact floating-point mean so
// that the truncation does not bias the variance.
func meanStdDevInt(xs []int) (mean int, stddev float64) {
	n := len(xs)
	if n == 0 {
		return 0, 0
	}
	var sum int
	for _, x := range xs {
		sum += x
	}
	// Use the exact float mean for stddev to avoid bias from integer truncation.
	fmean := float64(sum) / float64(n)
	var sumSq float64
	for _, x := range xs {
		d := float64(x) - fmean
		sumSq += d * d
	}
	return sum / n, math.Sqrt(sumSq / float64(n))
}

// percentileFloat64 returns the p-th percentile of an already-sorted float64 slice
// using the nearest-rank method. p must be in [1, 100].
func percentileFloat64(sorted []float64, p int) float64 {
	if len(sorted) == 0 {
		return 0
	}
	// Nearest rank: 1-based rank ceil(p/100 × N), converted to a 0-based index.
	idx := int(math.Ceil(float64(p)/100*float64(len(sorted)))) - 1
	// Out-of-range p is clamped to the first or last element.
	if idx < 0 {
		idx = 0
	}
	if idx >= len(sorted) {
		idx = len(sorted) - 1
	}
	return sorted[idx]
}

// percentileInt returns the p-th percentile of an already-sorted int slice
// using the nearest-rank method. p must be in [1, 100].
// An empty slice yields 0.
func percentileInt(sorted []int, p int) int {
	if len(sorted) == 0 {
		return 0
	}
	// Nearest rank: 1-based rank ceil(p/100 × N), converted to a 0-based index.
	idx := int(math.Ceil(float64(p)/100*float64(len(sorted)))) - 1
	// Out-of-range p is clamped to the first or last element.
	if idx < 0 {
		idx = 0
	}
	if idx >= len(sorted) {
		idx = len(sorted) - 1
	}
	return sorted[idx]
}

diff --git a/pkg/cli/forecast_montecarlo_test.go b/pkg/cli/forecast_montecarlo_test.go
new file mode 100644
index 00000000000..3e6a12272f2
--- /dev/null
+++ b/pkg/cli/forecast_montecarlo_test.go
@@ -0,0 +1,331 @@
//go:build !integration

package cli

import (
	"math"
	"math/rand"
	"sort"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// deterministicRNG returns a seeded *rand.Rand for reproducible test results.
func deterministicRNG() *rand.Rand {
	return rand.New(rand.NewSource(42)) //nolint:gosec
}

// TestPoissonSample verifies that the Poisson sampler produces an empirical mean
// and variance close to lambda (within statistical tolerance for 100 000 draws).
func TestPoissonSample(t *testing.T) {
	rng := deterministicRNG()
	const lambda = 10.0 // within Knuth's exact branch (≤15)
	const n = 100_000

	sum := 0.0
	sumSq := 0.0
	for i := 0; i < n; i++ {
		v := float64(poissonSample(rng, lambda))
		sum += v
		sumSq += v * v
	}
	mean := sum / n
	// Population variance via E[X²] − (E[X])².
	variance := sumSq/n - mean*mean

	// Poisson(λ): mean == λ, variance == λ. Allow 1% relative error.
	assert.InEpsilon(t, lambda, mean, 0.01, "empirical mean should be close to lambda")
	assert.InEpsilon(t, lambda, variance, 0.01, "empirical variance should be close to lambda")
}

// TestPoissonSampleLargeLambda exercises the normal-approximation branch (lambda > 15).
func TestPoissonSampleLargeLambda(t *testing.T) {
	rng := deterministicRNG()
	const lambda = 100.0
	const n = 100_000

	sum := 0.0
	for i := 0; i < n; i++ {
		sum += float64(poissonSample(rng, lambda))
	}
	mean := sum / n

	assert.InEpsilon(t, lambda, mean, 0.01, "normal-approximation branch should produce correct mean")
}

// TestPoissonSampleEdgeCases checks boundary conditions.
func TestPoissonSampleEdgeCases(t *testing.T) {
	rng := deterministicRNG()
	assert.Equal(t, 0, poissonSample(rng, 0), "lambda=0 should return 0")
	assert.Equal(t, 0, poissonSample(rng, -5), "negative lambda should return 0")
}

// TestPercentileFloat64 checks the nearest-rank percentile helper.
func TestPercentileFloat64(t *testing.T) {
	sorted := []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
	assert.Equal(t, 1.0, percentileFloat64(sorted, 10), "P10")
	assert.Equal(t, 5.0, percentileFloat64(sorted, 50), "P50")
	assert.Equal(t, 9.0, percentileFloat64(sorted, 90), "P90")
	assert.Equal(t, 0.0, percentileFloat64(nil, 50), "empty slice")
}

// TestPercentileInt checks the int variant of the percentile helper.
func TestPercentileInt(t *testing.T) {
	sorted := []int{10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
	assert.Equal(t, 10, percentileInt(sorted, 10), "P10")
	assert.Equal(t, 50, percentileInt(sorted, 50), "P50")
	assert.Equal(t, 90, percentileInt(sorted, 90), "P90")
	assert.Equal(t, 0, percentileInt(nil, 50), "empty slice")
}

// TestMeanStdDevInt verifies the mean/stddev helper on a known distribution.
func TestMeanStdDevInt(t *testing.T) {
	// population stddev of {2,4,4,4,5,5,7,9} = 2, mean = 5.
	xs := []int{2, 4, 4, 4, 5, 5, 7, 9}
	mean, stddev := meanStdDevInt(xs)
	assert.Equal(t, 5, mean, "mean")
	assert.InDelta(t, 2.0, stddev, 0.001, "population stddev")

	// Empty input yields zeros for both results.
	m0, s0 := meanStdDevInt(nil)
	assert.Equal(t, 0, m0)
	assert.Equal(t, 0.0, s0)
}

// TestRunMonteCarloNilOnEmpty verifies that runMonteCarlo returns nil for empty inputs.
func TestRunMonteCarloNilOnEmpty(t *testing.T) {
	rng := deterministicRNG()
	assert.Nil(t, runMonteCarlo(nil, 0, 10.0, rng), "nil observations")
	assert.Nil(t, runMonteCarlo([]int{100, 200}, 2, 0.0, rng), "zero lambda")
	assert.Nil(t, runMonteCarlo([]int{100, 200}, 2, -1.0, rng), "negative lambda")
}

// TestRunMonteCarloBasicProperties checks that the Monte Carlo summary satisfies
// statistical invariants (P10 ≤ P50 ≤ P90, mean ≥ 0, stddev ≥ 0).
func TestRunMonteCarloBasicProperties(t *testing.T) {
	rng := deterministicRNG()
	// 20 historical runs, all successful, each using ~1 000 tokens.
	etObs := make([]int, 20)
	for i := range etObs {
		etObs[i] = 900 + i*10 // 900–1090
	}

	mc := runMonteCarlo(etObs, len(etObs), 10.0, rng)
	require.NotNil(t, mc)

	assert.Equal(t, monteCarloIterations, mc.Iterations)
	assert.GreaterOrEqual(t, mc.MeanProjectedEffectiveTokens, 0)
	assert.GreaterOrEqual(t, mc.StdDevEffectiveTokens, 0.0)
	assert.LessOrEqual(t, mc.P10ProjectedEffectiveTokens, mc.P50ProjectedEffectiveTokens, "ET P10 ≤ P50")
	assert.LessOrEqual(t, mc.P50ProjectedEffectiveTokens, mc.P90ProjectedEffectiveTokens, "ET P50 ≤ P90")
}

// TestRunMonteCarloZeroSuccessRate verifies that a 0% success rate produces zero ET.
func TestRunMonteCarloZeroSuccessRate(t *testing.T) {
	rng := deterministicRNG()
	etObs := []int{1000, 2000, 3000}
	// successCount = 0 → successRate = 0/3 = 0.
	mc := runMonteCarlo(etObs, 0, 5.0, rng)
	require.NotNil(t, mc)
	assert.Equal(t, 0, mc.P50ProjectedEffectiveTokens, "zero success rate → zero ET")
	assert.Equal(t, 0, mc.P90ProjectedEffectiveTokens, "zero success rate → zero ET P90")
}

// TestRunMonteCarloOrderOfMagnitude checks that the simulation mean is within
// 20% of the deterministic point estimate.
func TestRunMonteCarloOrderOfMagnitude(t *testing.T) {
	rng := deterministicRNG()
	etObs := []int{10_000, 12_000, 11_000, 9_500, 10_500}
	successCount := 5
	observedRunsPerPeriod := 20.0

	mc := runMonteCarlo(etObs, successCount, observedRunsPerPeriod, rng)
	require.NotNil(t, mc)

	// Deterministic point estimate (ET).
	var totalET int
	for _, et := range etObs {
		totalET += et
	}
	avgET := totalET / len(etObs)
	pointEstimate := int(math.Round(observedRunsPerPeriod * float64(avgET)))

	// Simulation mean should be within 20% of point estimate (with 100% success rate
	// and Poisson lambda = 20, the spread should be small).
	assert.InEpsilon(t, float64(pointEstimate), float64(mc.MeanProjectedEffectiveTokens), 0.20,
		"simulation mean ET should be close to point estimate")

	// P50 should also be within 20%.
	assert.InEpsilon(t, float64(pointEstimate), float64(mc.P50ProjectedEffectiveTokens), 0.20,
		"simulation P50 ET should be close to point estimate")

	// Confidence interval must bracket the mean.
	assert.LessOrEqual(t, mc.P10ProjectedEffectiveTokens, mc.MeanProjectedEffectiveTokens)
	assert.GreaterOrEqual(t, mc.P90ProjectedEffectiveTokens, mc.MeanProjectedEffectiveTokens)
}

// TestRunMonteCarloSortedOutputs verifies CI ordering holds across many random seeds.
func TestRunMonteCarloSortedOutputs(t *testing.T) {
	etObs := []int{5_000, 7_000, 6_000, 4_500}
	// Seeds 0 through 4 are exercised.
	for seed := int64(0); seed < 5; seed++ {
		rng := rand.New(rand.NewSource(seed)) //nolint:gosec
		mc := runMonteCarlo(etObs, len(etObs), 12.0, rng)
		require.NotNil(t, mc)
		assert.LessOrEqual(t, mc.P10ProjectedEffectiveTokens, mc.P50ProjectedEffectiveTokens)
		assert.LessOrEqual(t, mc.P50ProjectedEffectiveTokens, mc.P90ProjectedEffectiveTokens)
	}
}

// TestRunMonteCarloDistributionShape checks that the mean of the simulated ET
// distribution lies between P10 and P90.
func TestRunMonteCarloDistributionShape(t *testing.T) {
	rng := deterministicRNG()
	etObs := make([]int, 50)
	for i := range etObs {
		etObs[i] = 8_000 + i*40
	}
	mc := runMonteCarlo(etObs, len(etObs), 30.0, rng)
	require.NotNil(t, mc)

	assert.GreaterOrEqual(t, mc.MeanProjectedEffectiveTokens, mc.P10ProjectedEffectiveTokens, "mean ≥ P10")
	assert.LessOrEqual(t, mc.MeanProjectedEffectiveTokens, mc.P90ProjectedEffectiveTokens, "mean ≤ P90")
}

// TestPercentileSingleElement ensures percentile works for a length-1 slice.
func TestPercentileSingleElement(t *testing.T) {
	sorted := []float64{42.0}
	assert.Equal(t, 42.0, percentileFloat64(sorted, 10))
	assert.Equal(t, 42.0, percentileFloat64(sorted, 90))
}

// TestGammaSampleMeanVariance verifies that gammaSample produces the expected mean
// (= shape) and variance (= shape) for a Gamma(shape, scale=1) distribution.
func TestGammaSampleMeanVariance(t *testing.T) {
	rng := deterministicRNG()
	const shape = 5.5 // typical value: n+0.5 for n=5 observed runs
	const n = 200_000

	var sum, sumSq float64
	for i := 0; i < n; i++ {
		v := gammaSample(rng, shape)
		sum += v
		sumSq += v * v
	}
	mean := sum / n
	// Population variance via E[X²] − (E[X])².
	variance := sumSq/n - mean*mean

	// Gamma(shape, scale=1): mean = shape, variance = shape. Allow 1% relative error.
	assert.InEpsilon(t, shape, mean, 0.01, "gamma empirical mean should equal shape")
	assert.InEpsilon(t, shape, variance, 0.01, "gamma empirical variance should equal shape")
}

// TestGammaSampleSmallShape verifies the shape < 1 reduction path for multiple
// fractional shape values (0.3, 0.5, 0.8) to ensure the recursive identity
// Gamma(shape) = Gamma(shape+1) × U^(1/shape) is exercised correctly.
func TestGammaSampleSmallShape(t *testing.T) {
	const n = 200_000
	for _, shape := range []float64{0.3, 0.5, 0.8} {
		// A fresh seeded generator per shape keeps each case independent.
		rng := deterministicRNG()
		var sum float64
		for i := 0; i < n; i++ {
			sum += gammaSample(rng, shape)
		}
		mean := sum / n
		assert.InEpsilon(t, shape, mean, 0.01,
			"gamma mean should equal shape for shape=%v", shape)
	}
}

// TestGammaSampleEdgeCases checks boundary and degenerate inputs.
func TestGammaSampleEdgeCases(t *testing.T) {
	rng := deterministicRNG()
	assert.Equal(t, 0.0, gammaSample(rng, 0), "shape=0 → 0")
	assert.Equal(t, 0.0, gammaSample(rng, -1), "shape<0 → 0")
}

// TestRunMonteCarloIsReliable verifies that IsReliable reflects the minimum
// observation threshold.
func TestRunMonteCarloIsReliable(t *testing.T) {
	rng := deterministicRNG()

	// Below threshold: 3 observations < minObservationsForReliableForecast (5).
	smallObs := []int{1000, 1500, 1200}
	mcSmall := runMonteCarlo(smallObs, len(smallObs), 4.0, rng)
	require.NotNil(t, mcSmall)
	assert.False(t, mcSmall.IsReliable, "fewer than 5 observations → IsReliable=false")

	// At threshold: exactly minObservationsForReliableForecast observations.
	atThreshold := []int{1000, 1100, 1200, 1300, 1400}
	mcAt := runMonteCarlo(atThreshold, len(atThreshold), 4.0, rng)
	require.NotNil(t, mcAt)
	assert.True(t, mcAt.IsReliable, "exactly 5 observations → IsReliable=true")

	// Well above threshold.
	largeObs := make([]int, 20)
	for i := range largeObs {
		largeObs[i] = 1000 + i*50
	}
	mcLarge := runMonteCarlo(largeObs, len(largeObs), 10.0, rng)
	require.NotNil(t, mcLarge)
	assert.True(t, mcLarge.IsReliable, "20 observations → IsReliable=true")
}

// TestRunMonteCarloGammaPoissonWiderCI verifies that the Gamma–Poisson compound model
// produces wider confidence intervals for small samples compared to a scenario where
// the rate is well-estimated (large sample). With small n the posterior Gamma has
// higher relative variance, so the simulated ET distribution should be broader.
func TestRunMonteCarloGammaPoissonWiderCI(t *testing.T) {
	// Same observed rate (λ = 10) but different sample sizes.
	etVal := 1_000 // constant ET to isolate run-count variability
	const lambda = 10.0

	// Small sample: 3 runs observed → high relative uncertainty in λ.
	smallObs := []int{etVal, etVal, etVal}
	rngSmall := rand.New(rand.NewSource(7)) //nolint:gosec
	mcSmall := runMonteCarlo(smallObs, len(smallObs), lambda, rngSmall)
	require.NotNil(t, mcSmall)

	// Large sample: 100 runs observed → low relative uncertainty in λ.
	largeObs := make([]int, 100)
	for i := range largeObs {
		largeObs[i] = etVal
	}
	rngLarge := rand.New(rand.NewSource(7)) //nolint:gosec
	mcLarge := runMonteCarlo(largeObs, len(largeObs), lambda, rngLarge)
	require.NotNil(t, mcLarge)

	// Width of the P10–P90 interval for each scenario.
	ciSmall := mcSmall.P90ProjectedEffectiveTokens - mcSmall.P10ProjectedEffectiveTokens
	ciLarge := mcLarge.P90ProjectedEffectiveTokens - mcLarge.P10ProjectedEffectiveTokens

	assert.Greater(t, ciSmall, ciLarge,
		"small-sample CI (P90-P10=%d) should be wider than large-sample CI (%d)", ciSmall, ciLarge)
}

// TestRunMonteCarloFullEpisodePath is a smoke test that exercises runMonteCarlo
// with a realistic setup and validates ET percentile ordering.
func TestRunMonteCarloFullEpisodePath(t *testing.T) {
	rng := deterministicRNG()

	// Simulate 30 completed runs with varied token counts.
	etObs := make([]int, 30)
	successCount := 0
	for i := range etObs {
		etObs[i] = 5_000 + i*200
		if i%5 != 0 { // 4 out of every 5 runs succeed → 80% success rate
			successCount++
		}
	}

	mc := runMonteCarlo(etObs, successCount, 8.0, rng)
	require.NotNil(t, mc)
	assert.Equal(t, monteCarloIterations, mc.Iterations)
	assert.Greater(t, mc.P90ProjectedEffectiveTokens, mc.P10ProjectedEffectiveTokens, "P90 > P10 for non-trivial inputs")

	// ET percentiles should already be in ascending order.
	ets := []int{mc.P10ProjectedEffectiveTokens, mc.P50ProjectedEffectiveTokens, mc.P90ProjectedEffectiveTokens}
	sorted := make([]int, len(ets))
	copy(sorted, ets)
	sort.Ints(sorted)
	assert.Equal(t, ets, sorted, "ET percentiles should already be in ascending order")
}

diff --git a/pkg/cli/forecast_test.go b/pkg/cli/forecast_test.go
new file mode 100644
index 00000000000..7050055f2c2
--- /dev/null
+++ b/pkg/cli/forecast_test.go
@@ -0,0 +1,104 @@
//go:build !integration

package cli

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// ── formatForecastPercent ────────────────────────────────────────────────────

func TestFormatForecastPercent_NoData(t *testing.T) {
	assert.Equal(t, "N/A", formatForecastPercent(0, false), "no data → N/A")
}

func TestFormatForecastPercent_ZeroPercent(t *testing.T) {
	// A legitimate 0% success rate (all runs failed) must NOT return N/A.
	assert.Equal(t, "0%", formatForecastPercent(0, true), "0% with data → '0%'")
}

func TestFormatForecastPercent_NonZero(t *testing.T) {
	assert.Equal(t, "92%", formatForecastPercent(0.923, true))
}

func TestFormatForecastPercent_OneHundred(t *testing.T) {
	assert.Equal(t, "100%", formatForecastPercent(1.0, true))
}

// ── formatForecastTokens ─────────────────────────────────────────────────────

func TestFormatForecastTokens_Zero(t *testing.T) {
	assert.Equal(t, "-", formatForecastTokens(0))
}

func TestFormatForecastTokens_SmallInt(t *testing.T) {
	assert.Equal(t, "500", formatForecastTokens(500))
}

func TestFormatForecastTokens_Kilo(t *testing.T) {
	assert.Equal(t, "12.5K", formatForecastTokens(12500))
}

func TestFormatForecastTokens_Mega(t *testing.T) {
	assert.Equal(t, "1.20M", formatForecastTokens(1_200_000))
}

// ── extractWorkflowIDFromName ─────────────────────────────────────────────────

// TestExtractWorkflowIDFromName checks that the .lock.yml, .yml and .yaml
// suffixes are stripped and that a bare ID is returned unchanged.
func TestExtractWorkflowIDFromName(t *testing.T) {
	cases := []struct {
		in   string
		want string
	}{
		{"ci-doctor", "ci-doctor"},
		{"ci-doctor.lock.yml", "ci-doctor"},
		{"ci-doctor.yml", "ci-doctor"},
		{"foo.yaml", "foo"},
		{"daily-planner.lock.yml", "daily-planner"},
	}
	for _, tc := range cases {
		assert.Equal(t, tc.want, extractWorkflowIDFromName(tc.in), "input=%q", tc.in)
	}
}

// ── RunForecast validation ────────────────────────────────────────────────────

func TestRunForecast_InvalidPeriod(t *testing.T) {
	cfg := ForecastConfig{Days: 30, Period: "quarter", SampleSize: 10}
	err := RunForecast(cfg)
	require.Error(t, err, "should error for invalid period")
}

func TestRunForecast_InvalidDays(t *testing.T) {
	cfg := ForecastConfig{Days: 90, Period: "month", SampleSize: 10}
	err := RunForecast(cfg)
	require.Error(t, err, "should error for days=90 (max is 30)")
}

// ── Duration enrichment ───────────────────────────────────────────────────────

// TestDurationEnrichment verifies that the forecast loop computes Duration from
// StartedAt/UpdatedAt when the Duration field is zero (as returned by gh run list).
//
// NOTE(review): the enrichment condition is re-implemented inline below rather
// than called from production code, so this test pins the intended rule only.
func TestDurationEnrichment(t *testing.T) {
	start := time.Date(2026, 1, 1, 10, 0, 0, 0, time.UTC)
	end := start.Add(5 * time.Minute)

	r := WorkflowRun{
		Status:     "completed",
		Conclusion: "success",
		StartedAt:  start,
		UpdatedAt:  end,
		// Duration is intentionally zero (not populated by gh run list)
	}

	// Simulate the enrichment logic from forecastWorkflow.
	if r.Duration == 0 && !r.StartedAt.IsZero() && !r.UpdatedAt.IsZero() {
		r.Duration = r.UpdatedAt.Sub(r.StartedAt)
	}

	assert.Equal(t, 5*time.Minute, r.Duration)
}