Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
eda279f
Add live golden-path (Tier 2) pipeline for azd ai agent extension
Jun 22, 2026
b877e6e
Fix live pipeline auth: run test/cleanup inside AzureCLI@2 with keepA…
Jun 22, 2026
9c906b1
Clean up Tier 2 driver: drop dead parallel-mode code, fix doc ref
Jun 22, 2026
f7d9c98
Address Copilot review on live Tier 2 pipeline
Jun 22, 2026
2518d8f
Address second Copilot review pass on live Tier 2 pipeline
Jun 22, 2026
08a4aef
Address third Copilot review pass: shell-quoting + tmux cleanup
Jun 22, 2026
7890eec
Harden E2E shell composition and rm -rf guardrail (Copilot round 4)
Jun 22, 2026
a2f634d
Clarify Linux/WSL support in E2E docstrings (Copilot round 5)
Jun 22, 2026
ef890ee
Bound setup() subprocess calls with timeout/check (Copilot round 6)
Jun 22, 2026
f2e67c2
Harden invoke assertion to a standalone 4/four token + Linux/WSL docs…
Jun 22, 2026
f6339db
Export GH token only once in setup(); raise instead of assert for std…
Jun 22, 2026
38310d0
Normalize TESTDIR to a vetted abspath; size watchdog to child phase-s…
Jun 22, 2026
729c744
Fail run on teardown failure; spawn child via sys.executable; use pin…
Jun 22, 2026
be2ddd1
Use backtick command substitution for uname so ADO can't read it as a…
Jun 22, 2026
db53b06
Give child watchdog a teardown margin over phase-sum; widen pipeline …
Jun 23, 2026
019d402
Drop redundant f-prefix on two placeholder-less log strings
Jun 23, 2026
171d7e6
Correct documented tmux minimum to match the ubuntu-22.04 CI agent
Jun 23, 2026
c62a696
Rewrite Tier 2 live E2E driver from tmux+Python to Go
Jun 24, 2026
4d27988
Make Tier 2 live init driver event-driven; calibrate prompts to wizar…
Jun 24, 2026
e17c88e
Address review: use errors.AsType and assert intentional 4.0 rejection
Jun 25, 2026
fa9b60d
Merge remote-tracking branch 'origin/main' into live-8758-fix
Jun 25, 2026
bd91674
Surface crash-path teardown errors in pipeline log
Jun 25, 2026
d6170cb
Parse azure.yaml via yaml.Unmarshal in findServiceName
Jun 25, 2026
9a1584c
Use t.Context() as the test parent context in Tier 2 runner
Jun 25, 2026
c23f900
Log unhandled prompts that fall through to the default dispatch case
Jun 26, 2026
58f180e
Merge remote-tracking branch 'origin/main' into live-8758-fix
Jun 26, 2026
ab890b1
Tighten interactive prompt dispatch matches to full preamble strings
Jun 26, 2026
4906d71
Reference PromptAiDeployment by name instead of stale line numbers
Jun 26, 2026
cbdf5a5
Fix subscription dispatch to match azd-core's "Select subscription"
Jun 26, 2026
3c0f717
Scope 2+2 invoke check to the agent response region
Jun 29, 2026
517544a
Generate live-pipeline config.json from extension.yaml via yq
Jun 29, 2026
2d9dfb7
Add region test for standalone 4 before agent reply
Jun 29, 2026
6c66c25
Merge origin/main; tag resource groups with DeleteAfter for EngSys cl…
Jun 30, 2026
30dfb29
fix: modernize strings.Split to SplitSeq for go fix lint
Jun 30, 2026
e1c7ab8
fix: tag cleanup resource groups by azd env name, not env values
Jun 30, 2026
990ffa5
build: bump creack/pty test dep to v1.1.24
Jun 30, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cli/azd/extensions/azure.ai.agents/cspell.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ words:
- aoai
- authorizationfailed
- azdaiagent
- azuresdk
- bicepless
- CLIENTSECRET
- curr
Expand Down Expand Up @@ -89,5 +90,9 @@ words:
- ttfb
- Bhadauria
- ensurepip
# Live E2E (Tier 2) Go driver
- creack
- elive
- testdir
# Test infrastructure
- recordproxy
3 changes: 3 additions & 0 deletions cli/azd/extensions/azure.ai.agents/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ require (
require github.com/denormal/go-gitignore v0.0.0-20180930084346-ae8ad1d07817

require (
github.com/Netflix/go-expect v0.0.0-20220104043353-73e0943537d2
Comment thread
v1212 marked this conversation as resolved.
github.com/creack/pty v1.1.24
github.com/hinshun/vt10x v0.0.0-20220119200601-820417d04eec
go.opentelemetry.io/otel v1.43.0
go.opentelemetry.io/otel/trace v1.43.0
golang.org/x/term v0.44.0
Expand Down
3 changes: 2 additions & 1 deletion cli/azd/extensions/azure.ai.agents/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,9 @@ github.com/clipperhouse/stringish v0.1.1/go.mod h1:v/WhFtE1q0ovMta2+m+UbpZ+2/HEX
github.com/clipperhouse/uax29/v2 v2.5.0 h1:x7T0T4eTHDONxFJsL94uKNKPHrclyFI0lm7+w94cO8U=
github.com/clipperhouse/uax29/v2 v2.5.0/go.mod h1:Wn1g7MK6OoeDT0vL+Q0SQLDz/KpfsVRgg6W7ihQeh4g=
github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/creack/pty v1.1.17 h1:QeVUsEDNrLBW4tMgZHvxy18sKtr6VI492kBhUfhDJNI=
github.com/creack/pty v1.1.17/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4=
github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s=
github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE=
github.com/danwakefield/fnmatch v0.0.0-20160403171240-cbb64ac3d964 h1:y5HC9v93H5EPKqaS1UYVg1uYah5Xf51mBfIoWehClUQ=
github.com/danwakefield/fnmatch v0.0.0-20160403171240-cbb64ac3d964/go.mod h1:Xd9hchkHSWYkEqJwUGisez3G1QY8Ryz0sdWrLPMGjLk=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
Expand Down
147 changes: 147 additions & 0 deletions cli/azd/extensions/azure.ai.agents/tests/e2e-live/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# azure.ai.agents — Live E2E (Tier 2)

Full golden-path tests that exercise the real `azd ai agent` CLI against **live
Azure** resources:

```
init → provision → deploy → invoke → down
```

A Go test driver answers the interactive `azd ai agent init` prompts through a
**pseudo-terminal** — [go-expect] sends keystrokes and [vt10x] renders the CLI's
terminal UI so the test can assert on the on-screen text, with [creack/pty]
providing the PTY. Synchronization is **event-driven**: the driver blocks on
go-expect reads until the survey UI stops emitting — i.e. a prompt is fully
drawn and waiting for input — instead of sleeping a fixed interval, then
dispatches on the rendered prompt text. The deploy mode is chosen up front via
`azd ai agent init --deploy-mode code|container` (it is not an interactive
prompt once a manifest is supplied). The non-interactive phases (`provision`,
`deploy`, `invoke`, `down`) shell out to `azd ... --no-prompt`. Both deploy
modes are covered:

| Mode | What it does |
| ----------- | ------------------------------------------------------- |
| `code` | Source-code (zip) deploy of the agent service |
| `container` | Container (ACR build) deploy of the agent service |
Comment thread
v1212 marked this conversation as resolved.

The two modes run **sequentially** (same subscription → avoids resource races).

[go-expect]: https://github.com/Netflix/go-expect
[vt10x]: https://github.com/hinshun/vt10x
[creack/pty]: https://github.com/creack/pty

## How the `init` driver answers prompts

The interactive sub-flows (Foundry project selection, model/deployment) branch
on live runtime state, so the exact set and order of prompts is not fixed ahead
of time. Rather than a linear expect script, the driver runs a **dispatch
loop**: it waits for output to settle, reads the rendered screen, matches the
active `?` prompt against the verbatim strings the extension prints — each case
in `dispatchPrompt` is annotated with the source `file:line` it mirrors — and
sends the answer. A loop detector bounds any prompt that fails to advance so a
wording change upstream fails fast instead of hanging.

Because the prompt strings are calibrated against the extension source, changes
there can require updating `dispatchPrompt`. And because a real PTY, Azure auth,
and the installed extension are all required, the **end-to-end interactive
correctness is only exercised by a live Tier 2 run** — it cannot be reproduced
by the platform-agnostic unit tests in this package.

## Where this fits

| Tier | Coverage | Where it runs |
| ---- | ----------------------------------------- | ------------------------------------------------------ |
| 0 | Offline CLI validation (no auth) | PR gate — `.github/workflows/lint-ext-azure-ai-agents.yml` |
| 1 | `init` variants (recording/playback) | PR gate — same workflow |
| 2 | **Full live golden path** (this folder) | **`eng/pipelines/ext-azure-ai-agents-live.yml`** |
Comment thread
v1212 marked this conversation as resolved.

Live Azure access is deliberately kept **out** of the automatic PR pipeline (Azure
SDK EngSys / SFI guidance). Tier 2 runs only on demand or on a schedule.

## Running in CI

Pipeline: `eng/pipelines/ext-azure-ai-agents-live.yml` (ADO). The Tier 2 step
builds `azd` + the extension and runs `go test -run TestTier2Live` inside an
`AzureCLI@2` task (so the federated az session stays valid for the whole run).

- **On demand (per PR):** comment `/azp run ext-azure-ai-agents-live` on the PR.
Requires write permission on the repo.
- **Scheduled:** weekly, Monday 07:00 UTC against `main`.
- **Manual:** queue the pipeline and pick `deployModes` = `both` / `code` /
`container`.

Logs for each run are published as the `tier2-live-logs-<BuildId>` artifact.

### One-time admin setup

1. **Register the pipeline** in Azure DevOps pointing at
`eng/pipelines/ext-azure-ai-agents-live.yml`, named `ext-azure-ai-agents-live`
(the name used by `/azp run`).
2. **Service connection** — the `serviceConnection` parameter (default
`azure-sdk-tests`) must map to the shared **TME test subscription** via OIDC /
workload-identity federation. The federated identity needs enough RBAC to
create Foundry projects and deploy models (Contributor + Azure AI Developer +
Cognitive Services Contributor, or equivalent).
3. **GitHub auth** — clones of the starter template use the azure-sdk org secret
`azuresdk-github-pat` (already provided by the Azure SDK ADO project) to avoid
anonymous rate limits, so no extra secret setup is required.

## Running locally (Linux / WSL)

The live driver is tagged `//go:build linux` (the platform CI runs on) because it
relies on a real PTY and a controlling terminal. On Windows, run it under WSL.

Prerequisites: Linux (including WSL), a Go toolchain matching `go.mod`
(`GOTOOLCHAIN=auto` fetches the right version automatically), `azd` (>= 1.25.5)
with the `azure.ai.agents` extension installed, and `az` logged in.

Run from the extension root (`cli/azd/extensions/azure.ai.agents`):

```bash
# Use azd's built-in auth locally (NOT az CLI auth — it is slow under WSL).
azd config unset auth.useAzCliAuth
azd auth login

# Both modes (sequential):
AZURE_AI_AGENTS_E2E_LIVE=1 E2E_DEPLOY_MODES=both \
go test -run TestTier2Live -count=1 -timeout 130m -v ./tests/e2e-live/

# A single golden path:
AZURE_AI_AGENTS_E2E_LIVE=1 E2E_DEPLOY_MODES=code \
go test -run TestTier2Live -count=1 -timeout 90m -v ./tests/e2e-live/
```

Without `AZURE_AI_AGENTS_E2E_LIVE=1` the test is **skipped**, so the package is
safe to include in a normal `go test ./...`.

### Useful environment variables

| Variable | Default | Purpose |
| -------------------------- | ------------------------------ | ----------------------------------------------------------- |
| `AZURE_AI_AGENTS_E2E_LIVE` | — | **Required** `=1` gate; unset → the test is skipped |
| `E2E_DEPLOY_MODES` | `both` | `both` / `code` / `container` |
| `E2E_CREATE_PROJECT` | `false` | `true` → always create a fresh Foundry project |
| `E2E_PROJECT` | — | Name of an existing Foundry project to select instead |
| `E2E_LOCATION` | `eastus2` | Region for new projects (needs model quota) |
| `E2E_SUBSCRIPTION` | — | Subscription id (filters the picker) |
| `E2E_TENANT` | — | AAD tenant id (sets `AZURE_TENANT_ID` for azd) |
| `E2E_USE_AZ_CLI_AUTH` | — | `true` → set `auth.useAzCliAuth` (CI; auto-on under ADO/GHA) |
| `E2E_TESTDIR` | `/tmp/e2e-tests/tier2-<mode>` | Scratch dir for the scaffolded project |
| `E2E_KEEP_ARTIFACTS` | — | `true` → keep the per-run `AZD_CONFIG_DIR` copy for debugging |
| `GH_TOKEN` | — | GitHub token for template clone (optional) |

In CI the driver auto-detects GitHub Actions (`GITHUB_ACTIONS`) and Azure DevOps
(`TF_BUILD`) and switches to `az` CLI auth automatically. Azure resources are
always torn down (`azd down --force --purge`) via `t.Cleanup`, even on failure.

## Files

| File | Purpose |
| -------------------- | -------------------------------------------------------------------------------- |
| `tier2_live_test.go` | `TestTier2Live` — drives init/provision/deploy/invoke/down per mode (Linux-only) |
| `console_test.go` | PTY + vt10x console helper that renders the interactive CLI (Linux-only) |
| `assert.go` | Pure-logic answer matcher (`responseHasExpectedAnswer`) — builds on any platform |
| `assert_test.go` | Unit tests for the matcher — run anywhere via `go test ./tests/e2e-live/` |

Each phase has bounded timeouts and best-effort `azd down --force --purge`
teardown so a crash mid-run does not leak billable resources.
100 changes: 100 additions & 0 deletions cli/azd/extensions/azure.ai.agents/tests/e2e-live/assert.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

// Package e2elive contains the Tier 2 live golden-path end-to-end test for the
// azure.ai.agents extension: init -> provision -> deploy -> invoke -> down,
// driven against real Azure resources. See README.md for setup and how to run.
package e2elive

import (
"regexp"
"unicode"
)

// spelledFourRe matches the spelled-out word "four" as a standalone word
// (case-insensitive), e.g. "the answer is four" or "FOUR". The \b anchors on
// both sides keep longer words such as "fourteen" from matching. It is compiled
// once at package scope so each check only pays for the match itself.
var spelledFourRe = regexp.MustCompile(`(?i)\bfour\b`)

// agentLineRe finds the beginning of an agent reply line. invoke prints the
// reply as "[<agentName>] <text>" (invoke.go printf "[%s] %s"), so the pattern
// is a bracketed, non-empty name at the start of a line followed by one space.
var agentLineRe = regexp.MustCompile(`(?m)^\[[^\]]+\] `)

// responseEndRe finds the footer invoke prints after the reply,
// "Server responded in ...". Everything between the agent line and this footer
// is the model's answer.
var responseEndRe = regexp.MustCompile(`Server responded in`)

// agentResponseRegion extracts the agent's printed answer from captured invoke
// output: the text from the first "[<agent>] " line up to, but not including,
// the first "Server responded in" footer that follows it.
//
// Narrowing the 2+2 check to this slice stops unrelated "4"s elsewhere in the
// CLI output (model names, versions, status codes) from satisfying the test.
// When either marker cannot be found the output format is assumed to have
// changed; the whole of out is returned unchanged and the caller's
// standalone-digit rules remain the only guard against false positives.
func agentResponseRegion(out string) string {
	replyLoc := agentLineRe.FindStringIndex(out)
	if replyLoc == nil {
		// No agent line at all: nothing to scope to.
		return out
	}

	replyStart := replyLoc[0]
	// Only a footer that appears after the agent line closes the region.
	footerLoc := responseEndRe.FindStringIndex(out[replyStart:])
	if footerLoc == nil {
		return out
	}
	return out[replyStart : replyStart+footerLoc[0]]
}

// responseHasExpectedAnswer reports whether text contains an acceptable answer
// to "what is 2+2?": either the whole word "four" (any letter case) or a "4"
// that stands on its own.
//
// The text being checked is captured CLI output, which can carry digits that
// have nothing to do with the answer — model names ("gpt-4o-mini"), versions
// ("4.1"), status codes ("404"). A plain substring search for "4" would accept
// all of those, so the digit only counts when it is not embedded in a larger
// word or number. The intended rule is the lookaround expression
// (?<![\w.])4(?!\.\d)(?!\w).
//
// "4.0" is rejected on purpose. It equals 4 numerically, but any "4.<digit>"
// token is classified as a version/decimal so that "4.1"-style strings cannot
// slip through, and the expected model reply is "4" or "four" rather than "4.0".
//
// Go's regexp package (RE2) supports neither lookahead nor lookbehind, so the
// digit rule is evaluated by a rune scan in hasStandaloneFour instead of a
// single pattern; only the spelled-out form uses a regular expression.
func responseHasExpectedAnswer(text string) bool {
	return spelledFourRe.MatchString(text) || hasStandaloneFour(text)
}

// hasStandaloneFour reports whether text contains a "4" digit that stands alone,
// reproducing the lookaround in the Python regex (?<![\w.])4(?!\.\d)(?!\w):
//   - not preceded by a word rune or '.' (rejects "x4", "_4", ".4")
//   - not followed by '.' then a digit (rejects "4.1", "4.0")
//   - not followed by a word rune (rejects "40", "4o", "4²")
//
// A "4" followed by '.' and anything other than a digit still counts, so a
// sentence-ending "4." is accepted. The scan stops at the first qualifying "4";
// an empty text yields false.
func hasStandaloneFour(text string) bool {
	// Work on runes rather than bytes so the neighbours inspected on either
	// side of a "4" are whole characters, not fragments of a multi-byte one.
	runes := []rune(text)
	for i, r := range runes {
		if r != '4' {
			continue
		}
		// (?<![\w.]) — a word rune or '.' immediately before disqualifies.
		if i > 0 {
			if prev := runes[i-1]; prev == '.' || isWordRune(prev) {
				continue
			}
		}
		// (?!\.\d) — "4.<digit>" is treated as a version/decimal token.
		if i+2 < len(runes) && runes[i+1] == '.' && unicode.IsDigit(runes[i+2]) {
			continue
		}
		// (?!\w) — a word rune immediately after disqualifies.
		if i+1 < len(runes) && isWordRune(runes[i+1]) {
			continue
		}
		return true
	}
	return false
}

// isWordRune reports whether r is a word character in the sense of the Python
// regex \w class: underscore, any Unicode letter, or any Unicode numeric
// character. unicode.IsNumber is used rather than unicode.IsDigit because
// Python's \w also covers non-decimal numerics such as superscripts ('²') and
// vulgar fractions ('½'); with IsDigit alone, "4²" or "4½" would be accepted
// as a standalone 4.
func isWordRune(r rune) bool {
	return r == '_' || unicode.IsLetter(r) || unicode.IsNumber(r)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

package e2elive

import "testing"

// TestResponseHasExpectedAnswer pins the 2+2 answer matcher: a standalone "4"
// or the whole word "four" is accepted, while a "4" embedded in a larger word,
// number, version, or decimal is rejected. Each row runs as its own parallel
// subtest named after the row.
func TestResponseHasExpectedAnswer(t *testing.T) {
	t.Parallel()

	cases := []struct {
		name string
		text string
		want bool
	}{
		// Accepted: the digit stands alone or the word is spelled out.
		{"plain four digit", "The answer is 4.", true},
		{"bare four", "4", true},
		{"equation", "2+2=4", true},
		{"spelled word", "It is four.", true},
		{"spelled upper", "FOUR", true},
		{"parenthesized", "(4)", true},
		{"trailing period mid-sentence", "the value 4. is final", true},

		// Rejected: the "4" is followed by, or sits inside, a larger token.
		{"model name", "gpt-4o-mini", false},
		{"version", "4.1", false},
		{"decimal four", "4.0", false}, // intentional: see responseHasExpectedAnswer doc
		{"status code", "404", false},
		{"price", "$40", false},
		{"ratio", "24/7", false},

		// Rejected: the "4" is preceded by a letter, underscore, or '.'.
		{"preceded by letter", "x4", false},
		{"preceded by underscore", "_4", false},
		{"preceded by dot", ".4", false},

		// Rejected: no answer present at all.
		{"fourteen", "fourteen apples", false},
		{"no answer", "I am not sure", false},
		{"empty", "", false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			if got := responseHasExpectedAnswer(tc.text); got != tc.want {
				t.Errorf("responseHasExpectedAnswer(%q) = %v, want %v", tc.text, got, tc.want)
			}
		})
	}
}

func TestAgentResponseRegion(t *testing.T) {
t.Parallel()

cases := []struct {
name string
out string
want bool // responseHasExpectedAnswer over the sliced region
}{
{
"answer scoped between markers",
"using model gpt-4o-mini\n[agent] The answer is 4.\nServer responded in 2s (first byte: 1s)\n",
true,
},
{
"stray digits outside region rejected",
"gpt-4o-mini deployed (404 cached)\n[agent] I am not sure.\nServer responded in 4.0s\n",
false,
},
{
"standalone 4 before agent line excluded by region",
"completed step 4\n[agent] I don't know.\nServer responded in 1s\n",
false,
},
{
"missing footer falls back to full text",
"using gpt-4o-mini\n[agent] four",
true,
},
{
"no agent line falls back to full text",
"the answer is four",
true,
},
}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
if got := responseHasExpectedAnswer(agentResponseRegion(tc.out)); got != tc.want {
t.Errorf("region(%q) -> %v, want %v", tc.out, got, tc.want)
}
Comment thread
v1212 marked this conversation as resolved.
Comment thread
v1212 marked this conversation as resolved.
})
}
}
Loading
Loading