microsoft · DavidKoleczek · Feb 10, 2026 · Feb 10, 2026 · Feb 10, 2026
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -6,6 +6,9 @@ on:
   pull_request:
     branches: [main]
 
+permissions:
+  contents: read
+
 jobs:
   quality:
     name: Code Quality

diff --git a/.gitignore b/.gitignore
@@ -4,7 +4,7 @@ media/
 .benchmark_results/
 .comparison_results/
 data/tasks/arc-agi-2-*/
-data/tasks/frontier-science-*/
+data/tasks/frontier-science/
 ai_working/
 
 # Benchmarking generated reports

diff --git a/README.md b/README.md
@@ -22,10 +22,10 @@ which are a mix of code and LLM calls to achieve a desired tradeoff between flex
 ```bash
 # Benchmarking requires certain prerequisites, see the full documentation for more details.
 # With uv (add to project dependencies, pinned to a release tag)
-uv add "eval-recipes @ git+https://github.com/microsoft/eval-recipes@v0.0.35"
+uv add "eval-recipes @ git+https://github.com/microsoft/eval-recipes@v0.0.36"
 
 # With pip
-pip install "git+https://github.com/microsoft/eval-recipes@v0.0.35"
+pip install "git+https://github.com/microsoft/eval-recipes@v0.0.36"
 ```
 
 > [!WARNING]

diff --git a/data/agents/amplifier_foundation/agent.yaml b/data/agents/amplifier_foundation/agent.yaml
@@ -13,8 +13,9 @@ runtime_files:
   - source: data
     dest: /project
 command_template: >-
-  amplifier run --bundle foundation --verbose --output-format "json" --model "claude-opus-4-5" --provider anthropic "{{task_instructions}}"
+  amplifier run --bundle foundation --verbose --output-format "json" --model "claude-opus-4-6" --provider anthropic "{{task_instructions}}"
 command_template_continue: >-
   amplifier continue "{{task_instructions}}"
-agent_logs_paths: []
-source_code_path: null
+agent_logs_paths:
+  - "Session logs are under `~/.amplifier/projects/`"
+source_code_path: null
diff --git a/data/agents/amplifier_foundation/data/.amplifier/settings.local.yaml b/data/agents/amplifier_foundation/data/.amplifier/settings.local.yaml
@@ -4,6 +4,7 @@ config:
     config:
       api_key: ${ANTHROPIC_API_KEY}
       base_url: https://api.anthropic.com
-      default_model: claude-opus-4-5-20251101
+      default_model: claude-opus-4-6
+      enable_prompt_caching: 'true'
       priority: 1
-    source: git+https://github.com/microsoft/amplifier-module-provider-anthropic@main
+    source: git+https://github.com/microsoft/amplifier-module-provider-anthropic@main
diff --git a/data/agents/gh_cli/agent.yaml b/data/agents/gh_cli/agent.yaml
@@ -8,8 +8,8 @@ dockerfile_portion: |
 installation_files: []
 runtime_files: []
 command_template: >-
-  copilot -p "{{task_instructions}}" --allow-all-tools --log-level all --stream off --model "claude-opus-4.5"
+  copilot -p "{{task_instructions}}" --allow-all-tools --log-level all --stream off --model "claude-opus-4.6"
 command_template_continue: >-
-  copilot -p "{{task_instructions}}" --continue --allow-all-tools --log-level all --stream off --model "claude-opus-4.5"
+  copilot -p "{{task_instructions}}" --continue --allow-all-tools --log-level all --stream off --model "claude-opus-4.6"
 agent_logs_paths: []
 source_code_path: null
diff --git a/data/agents/openai_codex/agent.yaml b/data/agents/openai_codex/agent.yaml
@@ -5,8 +5,8 @@ dockerfile_portion: |
 installation_files: []
 runtime_files: []
 command_template: >-
-  printenv OPENAI_API_KEY | codex login --with-api-key && codex exec --skip-git-repo-check --json --dangerously-bypass-approvals-and-sandbox -c model="gpt-5.1-codex-max" -c model_reasoning_effort="high" "{{task_instructions}}"
+  printenv OPENAI_API_KEY | codex login --with-api-key && codex exec --skip-git-repo-check --json --dangerously-bypass-approvals-and-sandbox -c model="gpt-5.2-codex" -c model_reasoning_effort="high" "{{task_instructions}}"
 command_template_continue: >-
-  printenv OPENAI_API_KEY | codex login --with-api-key && echo "{{task_instructions}}" | codex exec --skip-git-repo-check --json --dangerously-bypass-approvals-and-sandbox -c model="gpt-5.1-codex-max" -c model_reasoning_effort="high" resume --last
+  printenv OPENAI_API_KEY | codex login --with-api-key && echo "{{task_instructions}}" | codex exec --skip-git-repo-check --json --dangerously-bypass-approvals-and-sandbox -c model="gpt-5.2-codex" -c model_reasoning_effort="high" resume --last
 agent_logs_paths: []
 source_code_path: null
diff --git a/data/benchmarks/full_benchmark.yaml b/data/benchmarks/full_benchmark.yaml
@@ -9,16 +9,19 @@ comparison_benchmark:
         - claude_code
         - gh_cli
         - amplifier_foundation
+        - openai_codex
     - task_name: ppt-2
       agent_ids:
         - claude_code
         - gh_cli
         - amplifier_foundation
+        - openai_codex
     - task_name: ppt-3
       agent_ids:
         - claude_code
         - gh_cli
         - amplifier_foundation
+        - openai_codex
 
 score_benchmark:
   benchmark_type: score
@@ -29,14 +32,19 @@ score_benchmark:
       task_names:
         - arxiv_conclusion_extraction
         - arxiv_paper_summarizer
+        - chiptune_generator
         - code-discrepancy-docs-knack
         - code-discrepancy-docstrings-grasp
         - code-discrepancy-tutorials-grasp
         - cpsc_recall_monitor
         - cross_repo_improvement_tool
         - email_drafting
+        - energy_forecast_new_england
         - gdpval_extraction
+        - git_changelog_generator
         - github_docs_extractor
+        - image_tagging
+        - ipo_tracker
         - linkedin_drafting
         - markdown_deck_converter
         - news_research_tool
@@ -45,6 +53,7 @@ score_benchmark:
         - pdf-hr-q3
         - pdf-hr-q4
         - pdf-hr-q5
+        - pixel_art_generator
         - product_review_finder
         - repo_embedding_server
         - sec_10q_extractor
@@ -54,14 +63,19 @@ score_benchmark:
       task_names:
         - arxiv_conclusion_extraction
         - arxiv_paper_summarizer
+        - chiptune_generator
         - code-discrepancy-docs-knack
         - code-discrepancy-docstrings-grasp
         - code-discrepancy-tutorials-grasp
         - cpsc_recall_monitor
         - cross_repo_improvement_tool
         - email_drafting
+        - energy_forecast_new_england
         - gdpval_extraction
+        - git_changelog_generator
         - github_docs_extractor
+        - image_tagging
+        - ipo_tracker
         - linkedin_drafting
         - markdown_deck_converter
         - news_research_tool
@@ -70,6 +84,7 @@ score_benchmark:
         - pdf-hr-q3
         - pdf-hr-q4
         - pdf-hr-q5
+        - pixel_art_generator
         - product_review_finder
         - repo_embedding_server
         - sec_10q_extractor
@@ -79,14 +94,19 @@ score_benchmark:
       task_names:
         - arxiv_conclusion_extraction
         - arxiv_paper_summarizer
+        - chiptune_generator
         - code-discrepancy-docs-knack
         - code-discrepancy-docstrings-grasp
         - code-discrepancy-tutorials-grasp
         - cpsc_recall_monitor
         - cross_repo_improvement_tool
         - email_drafting
+        - energy_forecast_new_england
         - gdpval_extraction
+        - git_changelog_generator
         - github_docs_extractor
+        - image_tagging
+        - ipo_tracker
         - linkedin_drafting
         - markdown_deck_converter
         - news_research_tool
@@ -95,6 +115,38 @@ score_benchmark:
         - pdf-hr-q3
         - pdf-hr-q4
         - pdf-hr-q5
+        - pixel_art_generator
+        - product_review_finder
+        - repo_embedding_server
+        - sec_10q_extractor
+        - style_blender
+      trials: 1
+    - agent_id: openai_codex
+      task_names:
+        - arxiv_conclusion_extraction
+        - arxiv_paper_summarizer
+        - chiptune_generator
+        - code-discrepancy-docs-knack
+        - code-discrepancy-docstrings-grasp
+        - code-discrepancy-tutorials-grasp
+        - cpsc_recall_monitor
+        - cross_repo_improvement_tool
+        - email_drafting
+        - energy_forecast_new_england
+        - gdpval_extraction
+        - git_changelog_generator
+        - github_docs_extractor
+        - image_tagging
+        - ipo_tracker
+        - linkedin_drafting
+        - markdown_deck_converter
+        - news_research_tool
+        - pdf-hr-q1
+        - pdf-hr-q2
+        - pdf-hr-q3
+        - pdf-hr-q4
+        - pdf-hr-q5
+        - pixel_art_generator
         - product_review_finder
         - repo_embedding_server
         - sec_10q_extractor

diff --git a/data/tasks/arxiv_conclusion_extraction/task.yaml b/data/tasks/arxiv_conclusion_extraction/task.yaml
@@ -3,13 +3,11 @@ task_info:
   difficulty: easy
   non_deterministic_evals: false
   categories:
-    - document_processing
-    - research
+    - web
 evaluation_configs:
   - type: score
     test_script: test.py
     test_command: uv run --no-project /project/test.py --conclusion /project/Conclusion.txt
-dockerfile_portion: ""
 instructions: >-
   Can you download the paper at https://arxiv.org/abs/2411.10541 and write to a file
   named "Conclusion.txt" the exact conclusion of the paper (not including the name of the section)?

diff --git a/data/tasks/arxiv_paper_summarizer/task.yaml b/data/tasks/arxiv_paper_summarizer/task.yaml
@@ -4,13 +4,11 @@ task_info:
   non_deterministic_evals: true
   categories:
     - cli_tool
-    - research
-    - document_processing
+    - ai_tool
 evaluation_configs:
   - type: score
     test_script: test.py
     test_command: uv run --no-project /project/test.py
-dockerfile_portion: ""
 instructions: |
   Build a CLI tool that downloads and summarizes academic papers from arxiv.org, then validates the summaries using claim verification.
 

diff --git a/data/tasks/chiptune_generator/task.yaml b/data/tasks/chiptune_generator/task.yaml
@@ -0,0 +1,45 @@
+name: chiptune_generator
+task_info:
+  difficulty: hard
+  non_deterministic_evals: true
+  categories:
+    - cli_tool
+    - ai_tool
+evaluation_configs:
+  - type: score
+    test_script: test.py
+    test_command: uv run --no-project /project/test.py
+dockerfile_portion: |
+  RUN apt-get update && apt-get install -y --no-install-recommends \
+      fluidsynth \
+      fluid-soundfont-gm \
+      && rm -rf /var/lib/apt/lists/*
+instructions: |
+  I want a command-line tool that generates chip tune music from plain English descriptions. Chip tunes are that retro 8-bit sound from classic game consoles - square waves, triangle waves, arpeggios, etc.
+  I want to describe what I'm looking for and get a MIDI file out.
+
+  The basic workflow would be something like:
+
+  - `chiptune generate "an upbeat battle theme with fast arpeggios" --output battle.mid`
+
+  - Listen to it, decide I want changes
+
+  - `chiptune refine battle.mid "make the bass line more prominent and add faster arpeggios" --output battle_v2.mid`
+
+  - Repeat until I'm happy
+
+  The generate command should use an LLM to interpret my description and figure out the right tempo, key, instruments, etc. A description like "melancholy and slow" should sound very different from "fast energetic boss battle."
+  The output should be a valid MIDI file with multiple channels - lead melody, harmony, bass, percussion - using instruments that actually sound like chip tunes, not piano or strings.
+
+  The refine command is important to me. It needs to actually read and understand the existing MIDI file, then apply my feedback to it.
+
+  I'd also like an analyze command (`chiptune analyze battle.mid`) that prints a human-readable breakdown of a MIDI file - tempo, duration, what channels are being used, note counts, pitch ranges, that kind of thing. Useful for understanding what the tool produced.
+
+  The generated tunes should be somewhere between 30 seconds and 2 minutes long.
+
+  Please include a README with examples of all three commands.
+  You have an OPENAI_API_KEY and ANTHROPIC_API_KEY available for any LLM API calls you need to make.
+test_time_files:
+  - source: test_time_data
+    dest: /project
+timeout: 4500