Skip to content

Commit 8adfb2d

Browse files
David koleczek/task updates (#50)
* new tasks and task updates
* update version
1 parent dcc0731 commit 8adfb2d

56 files changed

Lines changed: 1971 additions & 470 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/ci.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ on:
66
pull_request:
77
branches: [main]
88

9+
permissions:
10+
contents: read
11+
912
jobs:
1013
quality:
1114
name: Code Quality

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ media/
44
.benchmark_results/
55
.comparison_results/
66
data/tasks/arc-agi-2-*/
7-
data/tasks/frontier-science-*/
7+
data/tasks/frontier-science/
88
ai_working/
99

1010
# Benchmarking generated reports

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@ which are a mix of code and LLM calls to achieve a desired tradeoff between flex
2222
```bash
2323
# Benchmarking requires certain prerequisites, see the full documentation for more details.
2424
# With uv (add to project dependencies, pinned to a release tag)
25-
uv add "eval-recipes @ git+https://github.com/microsoft/eval-recipes@v0.0.35"
25+
uv add "eval-recipes @ git+https://github.com/microsoft/eval-recipes@v0.0.36"
2626

2727
# With pip
28-
pip install "git+https://github.com/microsoft/eval-recipes@v0.0.35"
28+
pip install "git+https://github.com/microsoft/eval-recipes@v0.0.36"
2929
```
3030

3131
> [!WARNING]

data/agents/amplifier_foundation/agent.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@ runtime_files:
1313
- source: data
1414
dest: /project
1515
command_template: >-
16-
amplifier run --bundle foundation --verbose --output-format "json" --model "claude-opus-4-5" --provider anthropic "{{task_instructions}}"
16+
amplifier run --bundle foundation --verbose --output-format "json" --model "claude-opus-4-6" --provider anthropic "{{task_instructions}}"
1717
command_template_continue: >-
1818
amplifier continue "{{task_instructions}}"
19-
agent_logs_paths: []
20-
source_code_path: null
19+
agent_logs_paths:
20+
- "Session logs are under `~/.amplifier/projects/`"
21+
source_code_path: null

data/agents/amplifier_foundation/data/.amplifier/settings.local.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ config:
44
config:
55
api_key: ${ANTHROPIC_API_KEY}
66
base_url: https://api.anthropic.com
7-
default_model: claude-opus-4-5-20251101
7+
default_model: claude-opus-4-6
8+
enable_prompt_caching: 'true'
89
priority: 1
9-
source: git+https://github.com/microsoft/amplifier-module-provider-anthropic@main
10+
source: git+https://github.com/microsoft/amplifier-module-provider-anthropic@main

data/agents/gh_cli/agent.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ dockerfile_portion: |
88
installation_files: []
99
runtime_files: []
1010
command_template: >-
11-
copilot -p "{{task_instructions}}" --allow-all-tools --log-level all --stream off --model "claude-opus-4.5"
11+
copilot -p "{{task_instructions}}" --allow-all-tools --log-level all --stream off --model "claude-opus-4.6"
1212
command_template_continue: >-
13-
copilot -p "{{task_instructions}}" --continue --allow-all-tools --log-level all --stream off --model "claude-opus-4.5"
13+
copilot -p "{{task_instructions}}" --continue --allow-all-tools --log-level all --stream off --model "claude-opus-4.6"
1414
agent_logs_paths: []
1515
source_code_path: null

data/agents/openai_codex/agent.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ dockerfile_portion: |
55
installation_files: []
66
runtime_files: []
77
command_template: >-
8-
printenv OPENAI_API_KEY | codex login --with-api-key && codex exec --skip-git-repo-check --json --dangerously-bypass-approvals-and-sandbox -c model="gpt-5.1-codex-max" -c model_reasoning_effort="high" "{{task_instructions}}"
8+
printenv OPENAI_API_KEY | codex login --with-api-key && codex exec --skip-git-repo-check --json --dangerously-bypass-approvals-and-sandbox -c model="gpt-5.2-codex" -c model_reasoning_effort="high" "{{task_instructions}}"
99
command_template_continue: >-
10-
printenv OPENAI_API_KEY | codex login --with-api-key && echo "{{task_instructions}}" | codex exec --skip-git-repo-check --json --dangerously-bypass-approvals-and-sandbox -c model="gpt-5.1-codex-max" -c model_reasoning_effort="high" resume --last
10+
printenv OPENAI_API_KEY | codex login --with-api-key && echo "{{task_instructions}}" | codex exec --skip-git-repo-check --json --dangerously-bypass-approvals-and-sandbox -c model="gpt-5.2-codex" -c model_reasoning_effort="high" resume --last
1111
agent_logs_paths: []
1212
source_code_path: null

data/benchmarks/full_benchmark.yaml

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,19 @@ comparison_benchmark:
99
- claude_code
1010
- gh_cli
1111
- amplifier_foundation
12+
- openai_codex
1213
- task_name: ppt-2
1314
agent_ids:
1415
- claude_code
1516
- gh_cli
1617
- amplifier_foundation
18+
- openai_codex
1719
- task_name: ppt-3
1820
agent_ids:
1921
- claude_code
2022
- gh_cli
2123
- amplifier_foundation
24+
- openai_codex
2225

2326
score_benchmark:
2427
benchmark_type: score
@@ -29,14 +32,19 @@ score_benchmark:
2932
task_names:
3033
- arxiv_conclusion_extraction
3134
- arxiv_paper_summarizer
35+
- chiptune_generator
3236
- code-discrepancy-docs-knack
3337
- code-discrepancy-docstrings-grasp
3438
- code-discrepancy-tutorials-grasp
3539
- cpsc_recall_monitor
3640
- cross_repo_improvement_tool
3741
- email_drafting
42+
- energy_forecast_new_england
3843
- gdpval_extraction
44+
- git_changelog_generator
3945
- github_docs_extractor
46+
- image_tagging
47+
- ipo_tracker
4048
- linkedin_drafting
4149
- markdown_deck_converter
4250
- news_research_tool
@@ -45,6 +53,7 @@ score_benchmark:
4553
- pdf-hr-q3
4654
- pdf-hr-q4
4755
- pdf-hr-q5
56+
- pixel_art_generator
4857
- product_review_finder
4958
- repo_embedding_server
5059
- sec_10q_extractor
@@ -54,14 +63,19 @@ score_benchmark:
5463
task_names:
5564
- arxiv_conclusion_extraction
5665
- arxiv_paper_summarizer
66+
- chiptune_generator
5767
- code-discrepancy-docs-knack
5868
- code-discrepancy-docstrings-grasp
5969
- code-discrepancy-tutorials-grasp
6070
- cpsc_recall_monitor
6171
- cross_repo_improvement_tool
6272
- email_drafting
73+
- energy_forecast_new_england
6374
- gdpval_extraction
75+
- git_changelog_generator
6476
- github_docs_extractor
77+
- image_tagging
78+
- ipo_tracker
6579
- linkedin_drafting
6680
- markdown_deck_converter
6781
- news_research_tool
@@ -70,6 +84,7 @@ score_benchmark:
7084
- pdf-hr-q3
7185
- pdf-hr-q4
7286
- pdf-hr-q5
87+
- pixel_art_generator
7388
- product_review_finder
7489
- repo_embedding_server
7590
- sec_10q_extractor
@@ -79,14 +94,19 @@ score_benchmark:
7994
task_names:
8095
- arxiv_conclusion_extraction
8196
- arxiv_paper_summarizer
97+
- chiptune_generator
8298
- code-discrepancy-docs-knack
8399
- code-discrepancy-docstrings-grasp
84100
- code-discrepancy-tutorials-grasp
85101
- cpsc_recall_monitor
86102
- cross_repo_improvement_tool
87103
- email_drafting
104+
- energy_forecast_new_england
88105
- gdpval_extraction
106+
- git_changelog_generator
89107
- github_docs_extractor
108+
- image_tagging
109+
- ipo_tracker
90110
- linkedin_drafting
91111
- markdown_deck_converter
92112
- news_research_tool
@@ -95,6 +115,38 @@ score_benchmark:
95115
- pdf-hr-q3
96116
- pdf-hr-q4
97117
- pdf-hr-q5
118+
- pixel_art_generator
119+
- product_review_finder
120+
- repo_embedding_server
121+
- sec_10q_extractor
122+
- style_blender
123+
trials: 1
124+
- agent_id: openai_codex
125+
task_names:
126+
- arxiv_conclusion_extraction
127+
- arxiv_paper_summarizer
128+
- chiptune_generator
129+
- code-discrepancy-docs-knack
130+
- code-discrepancy-docstrings-grasp
131+
- code-discrepancy-tutorials-grasp
132+
- cpsc_recall_monitor
133+
- cross_repo_improvement_tool
134+
- email_drafting
135+
- energy_forecast_new_england
136+
- gdpval_extraction
137+
- git_changelog_generator
138+
- github_docs_extractor
139+
- image_tagging
140+
- ipo_tracker
141+
- linkedin_drafting
142+
- markdown_deck_converter
143+
- news_research_tool
144+
- pdf-hr-q1
145+
- pdf-hr-q2
146+
- pdf-hr-q3
147+
- pdf-hr-q4
148+
- pdf-hr-q5
149+
- pixel_art_generator
98150
- product_review_finder
99151
- repo_embedding_server
100152
- sec_10q_extractor

data/tasks/arxiv_conclusion_extraction/task.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,11 @@ task_info:
33
difficulty: easy
44
non_deterministic_evals: false
55
categories:
6-
- document_processing
7-
- research
6+
- web
87
evaluation_configs:
98
- type: score
109
test_script: test.py
1110
test_command: uv run --no-project /project/test.py --conclusion /project/Conclusion.txt
12-
dockerfile_portion: ""
1311
instructions: >-
1412
Can you download the paper at https://arxiv.org/abs/2411.10541 and write to a file
1513
named "Conclusion.txt" the exact conclusion of the paper (not including the name of the section)?

data/tasks/arxiv_paper_summarizer/task.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,11 @@ task_info:
44
non_deterministic_evals: true
55
categories:
66
- cli_tool
7-
- research
8-
- document_processing
7+
- ai_tool
98
evaluation_configs:
109
- type: score
1110
test_script: test.py
1211
test_command: uv run --no-project /project/test.py
13-
dockerfile_portion: ""
1412
instructions: |
1513
Build a CLI tool that downloads and summarizes academic papers from arxiv.org, then validates the summaries using claim verification.
1614

0 commit comments

Comments (0)