Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: pip install pytest
# Tests use sys.path manipulation to find local modules, so we
# don't need to install the project as a package — just install
# the dependencies that test modules import (directly or
# transitively via experiments/ablations/evolutionary_optimization.py).
run: pip install pytest anthropic

- name: Run tests
run: pytest tests/ -v
32 changes: 32 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
{"model": "haiku", "display_name": "Claude Code (Haiku 4.5)"},
{"model": "claude-sonnet-4-6", "display_name": "Claude Code (Sonnet 4.6)"},
{"model": "claude-opus-4-6", "display_name": "Claude Code (Opus 4.6)"},
{"model": "claude-opus-4-7", "display_name": "Claude Code (Opus 4.7)"},
],
"gemini": [
{"model": "gemini-2.5-pro", "display_name": "Gemini CLI (2.5 Pro)"},
Expand All @@ -54,6 +55,36 @@
{"model": "nvidia/nemotron-3-super", "display_name": "Nemotron-3-Super"},
{"model": "nvidia/Nemotron-120B-A12B", "display_name": "Nemotron-120B-A12B (Baseten)"},
],
# `opencode` is the generic OpenCode-CLI agent, supporting any provider
# OpenCode can route to. Model strings are <provider>/<model>; the
# provider key matches a block in the per-sandbox `.opencode.json`.
# Use this to compare the same model under different scaffoldings:
# opencode + anthropic/claude-haiku-4-5-20251001 vs. claude + haiku
# opencode + google/gemini-3.1-pro-preview vs. gemini + gemini-3.1-pro-preview
# `nemotron` (above) is kept as a deprecated alias that maps to the same
# OpenCode invocation but writes to the legacy `nemotron-<model>` slug.
"opencode": [
# Note the doubled `nvidia/` — OpenCode splits at the first slash
# to get provider=`nvidia`, model=`nvidia/nemotron-3-super`, which
# matches the literal name the Baseten endpoint serves the model
# under. See experiments/_opencode_config.py for the full rationale.
{"model": "nvidia/nvidia/nemotron-3-super",
"display_name": "OpenCode (Nemotron-3-Super)"},
{"model": "anthropic/claude-haiku-4-5-20251001",
"display_name": "OpenCode (Haiku 4.5)"},
{"model": "anthropic/claude-sonnet-4-5-20250929",
"display_name": "OpenCode (Sonnet 4.5)"},
{"model": "anthropic/claude-sonnet-4-6",
"display_name": "OpenCode (Sonnet 4.6)"},
{"model": "anthropic/claude-opus-4-7",
"display_name": "OpenCode (Opus 4.7)"},
{"model": "google/gemini-3.1-pro-preview",
"display_name": "OpenCode (Gemini 3.1 Pro)"},
{"model": "google/gemini-3-flash-preview",
"display_name": "OpenCode (Gemini 3 Flash)"},
{"model": "vllm/deepseek-ai/DeepSeek-V4-Flash",
"display_name": "OpenCode (DeepSeek V4 Flash)"},
],
}

SKILL_MAPPING: dict[str, str] = {
Expand Down Expand Up @@ -83,6 +114,7 @@
"gemini": 5,
"vibe": 10,
"nemotron": 5,
"opencode": 5,
}

POLICY_CONFIGS: dict[str, dict] = {
Expand Down
808 changes: 808 additions & 0 deletions data/contextual_injections_autoresearch_full.json

Large diffs are not rendered by default.

18,315 changes: 18,315 additions & 0 deletions data/contextual_injections_encoded.json

Large diffs are not rendered by default.

766 changes: 766 additions & 0 deletions data/contextual_injections_stego.json

Large diffs are not rendered by default.

499 changes: 499 additions & 0 deletions data/skill_md_overrides_autoresearch/2026-05-12-son-006/SKILL.md

Large diffs are not rendered by default.

499 changes: 499 additions & 0 deletions data/skill_md_overrides_autoresearch/2026-05-12-son-045/SKILL.md

Large diffs are not rendered by default.

499 changes: 499 additions & 0 deletions data/skill_md_overrides_autoresearch/2026-05-12-son-068/SKILL.md

Large diffs are not rendered by default.

Loading
Loading