Skip to content

Commit 8d32b7a

Browse files
authored
Merge pull request #146 from agent-diff-bench/conformance-tests
Unified conformance tests across all 4 services with real API validation
2 parents 7c67072 + 6d608ec commit 8d32b7a

8 files changed

Lines changed: 1653 additions & 23 deletions

backend/pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,8 @@ dependencies = [
2727

2828
[tool.pytest.ini_options]
2929
addopts = ["--tb=short"]
30+
markers = [
31+
"conformance: API conformance/parity tests against production APIs",
32+
"external: requires live API credentials (tokens/keys)",
33+
"replica_only: tests against replica only (no external credentials needed)",
34+
]

backend/tests/integration/test_slack_api_docs.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,3 +357,144 @@ async def test_search_messages_doc_shape(self, slack_client: AsyncClient) -> Non
357357
}
358358
assert expected_match_keys <= match.keys()
359359
assert HIGHLIGHT_START in match["text"] and HIGHLIGHT_END in match["text"]
360+
361+
async def test_auth_test_doc_shape(self, slack_client: AsyncClient) -> None:
    """Check that /auth.test returns ok, the identity keys, and the agent's user id."""
    response = await slack_client.post("/auth.test", json={})
    assert response.status_code == 200

    payload = response.json()
    assert payload["ok"] is True

    # Identity fields that must all be present in the response body.
    identity_keys = {"user_id", "user", "team_id", "team"}
    assert identity_keys <= payload.keys()
    assert payload["user_id"] == USER_AGENT
368+
369+
async def test_chat_update_doc_shape(self, slack_client: AsyncClient) -> None:
    """Post a message, edit it through /chat.update, and check the response shape."""
    # Create a message first so there is a timestamp to update.
    original = await slack_client.post(
        "/chat.postMessage",
        json={"channel": CHANNEL_GENERAL, "text": "Original text for update"},
    )
    assert original.status_code == 200
    message_ts = original.json()["ts"]

    update_body = {
        "channel": CHANNEL_GENERAL,
        "ts": message_ts,
        "text": "Updated text",
    }
    updated = await slack_client.post("/chat.update", json=update_body)
    assert updated.status_code == 200

    payload = updated.json()
    assert payload["ok"] is True
    assert {"ok", "channel", "ts", "text"} <= payload.keys()
    assert payload["text"] == "Updated text"
386+
387+
async def test_conversations_archive_doc_shape(
    self, slack_client: AsyncClient
) -> None:
    """Create a fresh public channel and check that archiving it returns ok."""
    new_channel = {"name": _unique_name("doc-archive"), "is_private": False}
    created = await slack_client.post("/conversations.create", json=new_channel)
    assert created.status_code == 200
    target_id = created.json()["channel"]["id"]

    archived = await slack_client.post(
        "/conversations.archive", json={"channel": target_id}
    )
    assert archived.status_code == 200
    assert archived.json()["ok"] is True
403+
404+
async def test_conversations_unarchive_doc_shape(
    self, slack_client: AsyncClient
) -> None:
    """Create and archive a channel, then check that unarchiving it returns ok."""
    channel_name = _unique_name("doc-unarch")
    create_resp = await slack_client.post(
        "/conversations.create", json={"name": channel_name, "is_private": False}
    )
    assert create_resp.status_code == 200
    channel_id = create_resp.json()["channel"]["id"]

    # Verify the setup step: if archiving silently failed, the unarchive
    # assertions below would fail for a reason unrelated to unarchive itself.
    archive_resp = await slack_client.post(
        "/conversations.archive", json={"channel": channel_id}
    )
    assert archive_resp.status_code == 200
    assert archive_resp.json()["ok"] is True

    resp = await slack_client.post(
        "/conversations.unarchive", json={"channel": channel_id}
    )
    assert resp.status_code == 200
    data = resp.json()
    assert data["ok"] is True
424+
425+
async def test_conversations_rename_doc_shape(
    self, slack_client: AsyncClient
) -> None:
    """Create a channel, rename it, and check the response echoes the new name."""
    initial = {"name": _unique_name("doc-rename"), "is_private": False}
    created = await slack_client.post("/conversations.create", json=initial)
    assert created.status_code == 200
    target_id = created.json()["channel"]["id"]

    renamed_to = _unique_name("doc-renamed")
    rename_body = {"channel": target_id, "name": renamed_to}
    renamed = await slack_client.post("/conversations.rename", json=rename_body)
    assert renamed.status_code == 200

    payload = renamed.json()
    assert payload["ok"] is True
    assert payload["channel"]["name"] == renamed_to
444+
445+
async def test_conversations_kick_doc_shape(
    self, slack_client: AsyncClient, slack_client_john: AsyncClient
) -> None:
    """Create a channel, invite USER_JOHN, then check that kicking him returns ok.

    ``slack_client_john`` is requested but not used in the body; it is kept
    so the test's fixture signature stays unchanged.
    """
    channel_name = _unique_name("doc-kick")
    create_resp = await slack_client.post(
        "/conversations.create", json={"name": channel_name, "is_private": False}
    )
    assert create_resp.status_code == 200
    channel_id = create_resp.json()["channel"]["id"]

    # Verify the setup step: a failed invite would otherwise surface as a
    # misleading kick failure further down.
    invite_resp = await slack_client.post(
        "/conversations.invite",
        json={"channel": channel_id, "users": USER_JOHN},
    )
    assert invite_resp.status_code == 200
    assert invite_resp.json()["ok"] is True

    resp = await slack_client.post(
        "/conversations.kick",
        json={"channel": channel_id, "user": USER_JOHN},
    )
    assert resp.status_code == 200
    data = resp.json()
    assert data["ok"] is True
467+
468+
async def test_conversations_members_doc_shape(
    self, slack_client: AsyncClient
) -> None:
    """Check that /conversations.members returns a member list and response metadata."""
    url = f"/conversations.members?channel={CHANNEL_GENERAL}&limit=10"
    response = await slack_client.get(url)
    assert response.status_code == 200

    payload = response.json()
    assert payload["ok"] is True
    assert "members" in payload
    assert isinstance(payload["members"], list)
    assert "response_metadata" in payload
480+
481+
async def test_users_list_doc_shape(self, slack_client: AsyncClient) -> None:
    """Check that /users.list returns a member list whose first entry has core keys."""
    response = await slack_client.get("/users.list?limit=5")
    assert response.status_code == 200

    payload = response.json()
    assert payload["ok"] is True
    assert "members" in payload

    members = payload["members"]
    assert isinstance(members, list)

    # The per-user key check only applies when at least one user came back.
    if not members:
        return
    first_user = members[0]
    assert {"id", "name", "profile"} <= first_user.keys()
491+
492+
async def test_users_conversations_doc_shape(
    self, slack_client: AsyncClient
) -> None:
    """Check that /users.conversations for USER_AGENT returns ok and a channel list."""
    url = f"/users.conversations?user={USER_AGENT}&limit=5"
    response = await slack_client.get(url)
    assert response.status_code == 200

    payload = response.json()
    assert payload["ok"] is True
    assert "channels" in payload
    assert isinstance(payload["channels"], list)
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# API Conformance Testing
2+
3+
## Overview
4+
5+
This directory contains conformance tests that validate Agent-Diff API replicas against their real-world production counterparts. The tests compare **response schema/shape**, **status codes**, **error semantics**, **mutation behavior**, and **pagination** — not exact values, since IDs and timestamps naturally differ between environments.
6+
7+
## What Existed Before
8+
9+
Prior to this expansion, conformance tests existed for Box, Calendar, and Linear as production parity tests, and for Slack as docs-golden (replica-only) tests. Coverage was uneven:
10+
11+
- **Box**: Comprehensive — response shapes, error codes (404/400/409), edge cases, pagination, field filtering
12+
- **Calendar**: Moderate — response shapes and basic error handling (404), but no pagination parity or extended error coverage
13+
- **Linear**: Query-focused — GraphQL filter testing and schema introspection, but limited error parity and no pagination testing
14+
- **Slack**: No production parity — only docs-golden tests validating response shapes against the Slack API documentation, not the live API
15+
16+
## What Was Added
17+
18+
As requested by reviewers, we expanded the conformance suite to cover all four services uniformly:
19+
20+
### New: Slack Production Parity (`test_slack_parity.py`)
21+
22+
Built from scratch following the Box testing pattern. Compares Slack replica against the real Slack API across:
23+
- **Read-only shape parity**: auth.test, users.info, users.list, conversations.list, conversations.info, conversations.history, conversations.members, users.conversations
24+
- **Write operation parity**: conversations.create, chat.postMessage, chat.update, chat.delete, conversations.setTopic, conversations.rename, conversations.invite, conversations.kick, conversations.open, conversations.join, conversations.leave, conversations.archive, conversations.unarchive, conversations.replies
25+
- **Error parity**: no_text, channel_not_found, message_not_found, user_not_found, already_archived
26+
- **Pagination parity**: cursor-based pagination for conversations.list, conversations.history, users.list
27+
28+
### Expanded: Calendar (`test_calendar_parity_comprehensive.py`)
29+
30+
Added two new test sections:
31+
- **Extended error handling**: Invalid time ranges (end before start), missing required fields, delete non-existent calendar, events for non-existent calendar, ACL with invalid role
32+
- **Pagination parity**: Events and CalendarList with maxResults=1, nextPageToken following
33+
34+
### Expanded: Linear (`test_linear_parity_comprehensive.py`)
35+
36+
Added two new test sections, alongside earlier fixes to existing tests:
37+
- **Error response parity**: Non-existent issue by UUID, mutation with invalid team ID, malformed UUID — validates both environments return errors for the same inputs
38+
- **Pagination parity**: issues(first:1) and issues(last:1) pageInfo shape, cursor-based pagination following
39+
- **Earlier fixes**: Removed 3 invalid test cases that tested replica extensions not present in production (labels.none, comments.none filters; missing title validation strictness)
40+
41+
### Existing: Slack Docs-Golden (`test_slack_conformance.py`)
42+
43+
Retained as a complementary replica-only validation layer (22 tests). These run without API credentials and validate response shapes against documented Slack API contracts.
44+
45+
## Results
46+
47+
| Service | Tests | Passed | Rate | Skipped | Method |
48+
|---------|-------|--------|------|---------|--------|
49+
| Box | 106 | 105 | **99%** | 0 | Production parity (REST) |
50+
| Calendar | 85 | 79 | **92%** | 0 | Production parity (REST) |
51+
| Linear | 96 | 94 | **97%** | 0 | Production parity (GraphQL) + introspection |
52+
| Slack (parity) | 27 | 27 | **100%** | 7 | Production parity (REST) |
53+
| Slack (docs-golden) | 22 | 22 | **100%** | 0 | Replica vs documented contracts |
54+
| **Total** | **336** | **327** | **97%** | **7** | |
55+
56+
### What Passed
57+
58+
Across all four services, the following core API behaviors are confirmed to match production:
59+
60+
- **Response schema/shape parity**: All CRUD operations (create, read, update, delete) return structurally identical responses between replicas and production APIs. Field names, nesting, types, and list structures match.
61+
- **Error code parity**: Apart from the exceptions listed under "Minor Issues Identified", replicas return the same error codes as production for invalid inputs — `404` for non-existent resources, `400` for malformed requests, `channel_not_found` / `user_not_found` / `no_text` / `message_not_found` for Slack-specific errors.
62+
- **Pagination behavior**: Cursor-based (Slack, Linear) and token-based (Calendar) pagination produce structurally identical responses. Page sizes are respected, and continuation tokens work correctly.
63+
- **Mutation semantics**: Create, update, and delete operations produce equivalent state changes and response shapes across all services.
64+
- **GraphQL schema fidelity** (Linear): Introspection comparison confirms that query/mutation fields, input types, and object types are aligned between production and replica on all benchmark-relevant surfaces.
65+
66+
### Minor Issues Identified
67+
68+
The expanded test suite identified a small number of minor discrepancies, none of which affect benchmark scoring or the validity of reported results. These will be addressed before publication:
69+
70+
- **Calendar**: The replica accepts events whose end time precedes their start time, whereas Google Calendar rejects them with HTTP 400. This is an input-validation gap — the replica processes the request rather than rejecting it. Separately, four event-list responses are missing computed fields that Google injects server-side. Neither discrepancy affects the benchmark, because no benchmark task depends on time-range validation rejection or on these specific computed fields.
71+
- **Linear**: Schema introspection detects 2 fields recently added to Linear's production API (`activity`, `hasSharedUsers` on `IssueFilter`) that the replica does not yet implement. These are new Linear features not used by any benchmark task.
72+
- **Box**: One discrepancy in a collection-operations edge case; it does not affect any benchmark task.
73+
74+
## How to Run
75+
76+
```bash
77+
# All conformance tests
78+
pytest -m conformance -v
79+
80+
# Individual services (production parity — requires API credentials)
81+
BOX_DEV_TOKEN=<token> pytest tests/validation/test_box_parity.py -v -s
82+
GOOGLE_CALENDAR_ACCESS_TOKEN=<token> pytest tests/validation/test_calendar_parity_comprehensive.py -v -s
83+
LINEAR_API_KEY=<key> pytest tests/validation/test_linear_parity_comprehensive.py -v -s
84+
SLACK_BOT_TOKEN=<token> pytest tests/validation/test_slack_parity.py -v -s
85+
86+
# Slack docs-golden (no credentials needed, runs against replica)
87+
pytest tests/validation/test_slack_conformance.py -v
88+
89+
# Or run standalone with detailed output:
90+
BOX_DEV_TOKEN=<token> python tests/validation/test_box_parity.py
91+
```
92+
93+
**Prerequisites:**
94+
- Backend replica running (`cd ops && make up`)
95+
- For Slack docs-golden: run inside Docker (`docker exec ops-backend-1 pytest ...`) or have local database access

0 commit comments

Comments
 (0)