diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 56527d5..784b702 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -6,6 +6,9 @@ on: pull_request: branches: [main] +permissions: + contents: read + jobs: quality: name: Code Quality diff --git a/.gitignore b/.gitignore index 882d447..c25fb41 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ media/ .benchmark_results/ .comparison_results/ data/tasks/arc-agi-2-*/ -data/tasks/frontier-science-*/ +data/tasks/frontier-science/ ai_working/ # Benchmarking generated reports diff --git a/README.md b/README.md index 4b1a3f3..cc2f193 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,10 @@ which are a mix of code and LLM calls to achieve a desired tradeoff between flex ```bash # Benchmarking requires certain prerequisites, see the full documentation for more details. # With uv (add to project dependencies, pinned to a release tag) -uv add "eval-recipes @ git+https://github.com/microsoft/eval-recipes@v0.0.35" +uv add "eval-recipes @ git+https://github.com/microsoft/eval-recipes@v0.0.36" # With pip -pip install "git+https://github.com/microsoft/eval-recipes@v0.0.35" +pip install "git+https://github.com/microsoft/eval-recipes@v0.0.36" ``` > [!WARNING] diff --git a/data/agents/amplifier_foundation/agent.yaml b/data/agents/amplifier_foundation/agent.yaml index 97873c9..0a49654 100644 --- a/data/agents/amplifier_foundation/agent.yaml +++ b/data/agents/amplifier_foundation/agent.yaml @@ -13,8 +13,9 @@ runtime_files: - source: data dest: /project command_template: >- - amplifier run --bundle foundation --verbose --output-format "json" --model "claude-opus-4-5" --provider anthropic "{{task_instructions}}" + amplifier run --bundle foundation --verbose --output-format "json" --model "claude-opus-4-6" --provider anthropic "{{task_instructions}}" command_template_continue: >- amplifier continue "{{task_instructions}}" -agent_logs_paths: [] -source_code_path: null +agent_logs_paths: + 
- "Session logs are under `~/.amplifier/projects/`" +source_code_path: null \ No newline at end of file diff --git a/data/agents/amplifier_foundation/data/.amplifier/settings.local.yaml b/data/agents/amplifier_foundation/data/.amplifier/settings.local.yaml index 0933cf1..364f585 100644 --- a/data/agents/amplifier_foundation/data/.amplifier/settings.local.yaml +++ b/data/agents/amplifier_foundation/data/.amplifier/settings.local.yaml @@ -4,6 +4,7 @@ config: config: api_key: ${ANTHROPIC_API_KEY} base_url: https://api.anthropic.com - default_model: claude-opus-4-5-20251101 + default_model: claude-opus-4-6 + enable_prompt_caching: 'true' priority: 1 - source: git+https://github.com/microsoft/amplifier-module-provider-anthropic@main + source: git+https://github.com/microsoft/amplifier-module-provider-anthropic@main \ No newline at end of file diff --git a/data/agents/gh_cli/agent.yaml b/data/agents/gh_cli/agent.yaml index 21d753a..fc9905f 100644 --- a/data/agents/gh_cli/agent.yaml +++ b/data/agents/gh_cli/agent.yaml @@ -8,8 +8,8 @@ dockerfile_portion: | installation_files: [] runtime_files: [] command_template: >- - copilot -p "{{task_instructions}}" --allow-all-tools --log-level all --stream off --model "claude-opus-4.5" + copilot -p "{{task_instructions}}" --allow-all-tools --log-level all --stream off --model "claude-opus-4.6" command_template_continue: >- - copilot -p "{{task_instructions}}" --continue --allow-all-tools --log-level all --stream off --model "claude-opus-4.5" + copilot -p "{{task_instructions}}" --continue --allow-all-tools --log-level all --stream off --model "claude-opus-4.6" agent_logs_paths: [] source_code_path: null diff --git a/data/agents/openai_codex/agent.yaml b/data/agents/openai_codex/agent.yaml index af6f602..9aad27c 100644 --- a/data/agents/openai_codex/agent.yaml +++ b/data/agents/openai_codex/agent.yaml @@ -5,8 +5,8 @@ dockerfile_portion: | installation_files: [] runtime_files: [] command_template: >- - printenv OPENAI_API_KEY | codex 
login --with-api-key && codex exec --skip-git-repo-check --json --dangerously-bypass-approvals-and-sandbox -c model="gpt-5.1-codex-max" -c model_reasoning_effort="high" "{{task_instructions}}" + printenv OPENAI_API_KEY | codex login --with-api-key && codex exec --skip-git-repo-check --json --dangerously-bypass-approvals-and-sandbox -c model="gpt-5.2-codex" -c model_reasoning_effort="high" "{{task_instructions}}" command_template_continue: >- - printenv OPENAI_API_KEY | codex login --with-api-key && echo "{{task_instructions}}" | codex exec --skip-git-repo-check --json --dangerously-bypass-approvals-and-sandbox -c model="gpt-5.1-codex-max" -c model_reasoning_effort="high" resume --last + printenv OPENAI_API_KEY | codex login --with-api-key && echo "{{task_instructions}}" | codex exec --skip-git-repo-check --json --dangerously-bypass-approvals-and-sandbox -c model="gpt-5.2-codex" -c model_reasoning_effort="high" resume --last agent_logs_paths: [] source_code_path: null diff --git a/data/benchmarks/full_benchmark.yaml b/data/benchmarks/full_benchmark.yaml index beedc45..53150dc 100644 --- a/data/benchmarks/full_benchmark.yaml +++ b/data/benchmarks/full_benchmark.yaml @@ -9,16 +9,19 @@ comparison_benchmark: - claude_code - gh_cli - amplifier_foundation + - openai_codex - task_name: ppt-2 agent_ids: - claude_code - gh_cli - amplifier_foundation + - openai_codex - task_name: ppt-3 agent_ids: - claude_code - gh_cli - amplifier_foundation + - openai_codex score_benchmark: benchmark_type: score @@ -29,14 +32,19 @@ score_benchmark: task_names: - arxiv_conclusion_extraction - arxiv_paper_summarizer + - chiptune_generator - code-discrepancy-docs-knack - code-discrepancy-docstrings-grasp - code-discrepancy-tutorials-grasp - cpsc_recall_monitor - cross_repo_improvement_tool - email_drafting + - energy_forecast_new_england - gdpval_extraction + - git_changelog_generator - github_docs_extractor + - image_tagging + - ipo_tracker - linkedin_drafting - markdown_deck_converter - 
news_research_tool @@ -45,6 +53,7 @@ score_benchmark: - pdf-hr-q3 - pdf-hr-q4 - pdf-hr-q5 + - pixel_art_generator - product_review_finder - repo_embedding_server - sec_10q_extractor @@ -54,14 +63,19 @@ score_benchmark: task_names: - arxiv_conclusion_extraction - arxiv_paper_summarizer + - chiptune_generator - code-discrepancy-docs-knack - code-discrepancy-docstrings-grasp - code-discrepancy-tutorials-grasp - cpsc_recall_monitor - cross_repo_improvement_tool - email_drafting + - energy_forecast_new_england - gdpval_extraction + - git_changelog_generator - github_docs_extractor + - image_tagging + - ipo_tracker - linkedin_drafting - markdown_deck_converter - news_research_tool @@ -70,6 +84,7 @@ score_benchmark: - pdf-hr-q3 - pdf-hr-q4 - pdf-hr-q5 + - pixel_art_generator - product_review_finder - repo_embedding_server - sec_10q_extractor @@ -79,14 +94,19 @@ score_benchmark: task_names: - arxiv_conclusion_extraction - arxiv_paper_summarizer + - chiptune_generator - code-discrepancy-docs-knack - code-discrepancy-docstrings-grasp - code-discrepancy-tutorials-grasp - cpsc_recall_monitor - cross_repo_improvement_tool - email_drafting + - energy_forecast_new_england - gdpval_extraction + - git_changelog_generator - github_docs_extractor + - image_tagging + - ipo_tracker - linkedin_drafting - markdown_deck_converter - news_research_tool @@ -95,6 +115,38 @@ score_benchmark: - pdf-hr-q3 - pdf-hr-q4 - pdf-hr-q5 + - pixel_art_generator + - product_review_finder + - repo_embedding_server + - sec_10q_extractor + - style_blender + trials: 1 + - agent_id: openai_codex + task_names: + - arxiv_conclusion_extraction + - arxiv_paper_summarizer + - chiptune_generator + - code-discrepancy-docs-knack + - code-discrepancy-docstrings-grasp + - code-discrepancy-tutorials-grasp + - cpsc_recall_monitor + - cross_repo_improvement_tool + - email_drafting + - energy_forecast_new_england + - gdpval_extraction + - git_changelog_generator + - github_docs_extractor + - image_tagging + - ipo_tracker + 
- linkedin_drafting + - markdown_deck_converter + - news_research_tool + - pdf-hr-q1 + - pdf-hr-q2 + - pdf-hr-q3 + - pdf-hr-q4 + - pdf-hr-q5 + - pixel_art_generator - product_review_finder - repo_embedding_server - sec_10q_extractor diff --git a/data/tasks/arxiv_conclusion_extraction/task.yaml b/data/tasks/arxiv_conclusion_extraction/task.yaml index e579373..3e1a51c 100644 --- a/data/tasks/arxiv_conclusion_extraction/task.yaml +++ b/data/tasks/arxiv_conclusion_extraction/task.yaml @@ -3,13 +3,11 @@ task_info: difficulty: easy non_deterministic_evals: false categories: - - document_processing - - research + - web evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py --conclusion /project/Conclusion.txt -dockerfile_portion: "" instructions: >- Can you download the paper at https://arxiv.org/abs/2411.10541 and write to a file named "Conclusion.txt" the exact conclusion of the paper (not including the name of the section)? diff --git a/data/tasks/arxiv_paper_summarizer/task.yaml b/data/tasks/arxiv_paper_summarizer/task.yaml index ba21a92..c736cad 100644 --- a/data/tasks/arxiv_paper_summarizer/task.yaml +++ b/data/tasks/arxiv_paper_summarizer/task.yaml @@ -4,13 +4,11 @@ task_info: non_deterministic_evals: true categories: - cli_tool - - research - - document_processing + - ai_tool evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | Build a CLI tool that downloads and summarizes academic papers from arxiv.org, then validates the summaries using claim verification. 
diff --git a/data/tasks/chiptune_generator/task.yaml b/data/tasks/chiptune_generator/task.yaml new file mode 100644 index 0000000..8770e65 --- /dev/null +++ b/data/tasks/chiptune_generator/task.yaml @@ -0,0 +1,45 @@ +name: chiptune_generator +task_info: + difficulty: hard + non_deterministic_evals: true + categories: + - cli_tool + - ai_tool +evaluation_configs: + - type: score + test_script: test.py + test_command: uv run --no-project /project/test.py +dockerfile_portion: | + RUN apt-get update && apt-get install -y --no-install-recommends \ + fluidsynth \ + fluid-soundfont-gm \ + && rm -rf /var/lib/apt/lists/* +instructions: | + I want a command-line tool that generates chip tune music from plain English descriptions. Chip tunes are that retro 8-bit sound from classic game consoles - square waves, triangle waves, arpeggios, etc. + I want to describe what I'm looking for and get a MIDI file out. + + The basic workflow would be something like: + + - `chiptune generate "an upbeat battle theme with fast arpeggios" --output battle.mid` + + - Listen to it, decide I want changes + + - `chiptune refine battle.mid "make the bass line more prominent and add faster arpeggios" --output battle_v2.mid` + + - Repeat until I'm happy + + The generate command should use an LLM to interpret my description and figure out the right tempo, key, instruments, etc. A description like "melancholy and slow" should sound very different from "fast energetic boss battle." + The output should be a valid MIDI file with multiple channels - lead melody, harmony, bass, percussion - using instruments that actually sound like chip tunes, not piano or strings. + + The refine command is important to me. It needs to actually read and understand the existing MIDI file, then apply my feedback to it. 
+ + I'd also like an analyze command (`chiptune analyze battle.mid`) that prints a human-readable breakdown of a MIDI file - tempo, duration, what channels are being used, note counts, pitch ranges, that kind of thing. Useful for understanding what the tool produced. + + The generated tunes should be somewhere between 30 seconds and 2 minutes long. + + Please include a README with examples of all three commands. + You have an OPENAI_API_KEY and ANTHROPIC_API_KEY available for any LLM API calls you need to make. +test_time_files: + - source: test_time_data + dest: /project +timeout: 4500 diff --git a/data/tasks/chiptune_generator/test.py b/data/tasks/chiptune_generator/test.py new file mode 100644 index 0000000..0364bb8 --- /dev/null +++ b/data/tasks/chiptune_generator/test.py @@ -0,0 +1,302 @@ +# Copyright (c) Microsoft. All rights reserved. + +import asyncio +from pathlib import Path +import sys + +import click +from loguru import logger + +from eval_recipes.benchmarking.evaluation.semantic_test import semantic_test +from eval_recipes.benchmarking.evaluation.test_utils import ( + get_instructions_from_file_or_default, + get_test_id_from_env_or_default, + write_test_result, +) + +# Define Semantic Test 1: Architecture and Implementation + +AGENT_SDK_NOTE = """\ +NOTE: The solution may use either direct LLM API calls (OpenAI, Anthropic) or an Agent SDK such as \ +Claude Agent/Code SDK, Microsoft Amplifier (https://github.com/microsoft/amplifier/tree/next), OpenAI \ +Codex CLI, or others that are similarly capable. Both approaches are acceptable. If an Agent SDK is used, \ +its built-in LLM capabilities count as satisfying the LLM usage checks below.""" + +STEPS_1_ARCHITECTURE = f"""\ +{AGENT_SDK_NOTE} + +1. Explore the code under scenarios/chiptune_generator/ to understand the full implementation. +2. Check how the solution generates MIDI files. It should use a proper MIDI library (e.g. \ +midiutil, mido, pretty_midi, or similar). 
Verify the library is listed in dependency files \ +and actually imported and used in the code. +3. Check how the solution uses an LLM or Agent SDK to interpret natural language descriptions. \ +There should be a reasonable separation of concerns - the LLM interprets the description into \ +musical parameters, and separate logic translates those parameters into MIDI data. It should \ +not be one giant function that does everything. +4. Check the MIDI generation code for chiptune-appropriate instrument choices: + - Look for use of synth lead programs (MIDI programs 80-87 range covers square waves, \ +sawtooth, etc.) or other chiptune-evocative sounds + - Check if multiple channels are used (melody, bass, harmony, percussion) + - The instrument choices should sound like chip tunes, not piano or orchestral instruments +5. Check for the three CLI commands the user requested: + - A "generate" command that takes a description and produces a MIDI file + - A "refine" command that takes an existing MIDI file and feedback to produce a modified file + - An "analyze" command that prints a human-readable breakdown of a MIDI file +6. Check that the refine command actually reads and parses the existing MIDI file to understand \ +its structure before modifying it. It should not simply regenerate from scratch while ignoring \ +the input file. +7. Check for proper error handling: invalid files, missing API keys, bad descriptions. +8. Verify the solution uses a recent, up-to-date LLM from Anthropic or OpenAI. If an Agent SDK \ +is used, check which model it is configured with. Check the model identifier in the code against \ +these references: + - Anthropic models: https://platform.claude.com/docs/en/about-claude/models/overview + - OpenAI models: https://platform.openai.com/docs/models""" + +RUBRIC_1_ARCHITECTURE = { + "midi_library": ( + "str - (10 points) Does the solution use a proper MIDI library (midiutil, mido, " + "pretty_midi, or similar) for generating MIDI files? 
Check both dependency files " + "and actual imports/usage in code." + ), + "llm_midi_separation": ( + "str - (15 points) Is there a reasonable separation between LLM interpretation " + "(producing structured musical parameters) and MIDI generation (consuming those " + "parameters)? They should not be a single monolithic function." + ), + "chiptune_instruments": ( + "str - (15 points) Does the MIDI generation use chiptune-appropriate instruments? " + "Look for synth lead programs (MIDI programs 80-87 for square/sawtooth waves), " + "multiple channels (melody, bass, harmony, percussion), and instrument choices " + "that evoke retro 8-bit sounds rather than piano or orchestral sounds." + ), + "three_cli_commands": ("str - (10 points) Are there three CLI commands: generate, refine, and analyze?"), + "refine_reads_existing": ( + "str - (10 points) Does the refine command actually read and parse the existing " + "MIDI file to understand its structure before modifying it (not generating from scratch)?" + ), + "error_handling": ("str - (5 points) Does the code handle errors properly (invalid files, missing API keys)?"), + "readme_exists": ( + "str - (10 points) Does a README.md exist under scenarios/chiptune_generator/ " + "with usage examples for all three commands (generate, refine, analyze)?" + ), + "uses_recent_model": ( + "str - (25 points) Does it use a recent model from Anthropic (see " + "https://platform.claude.com/docs/en/about-claude/models/overview) or OpenAI " + "(see https://platform.openai.com/docs/models)? If an Agent SDK is used, check " + "which model it is configured with. 5 points partial credit if a model is used " + "but it is not recent." + ), + "score": ( + "float - Score between 0 and 100 based on the above criteria. Sum the points earned from each criterion." 
+ ), +} + + +# Define Semantic Test 2: Functional -- Generation, Analysis, and Refinement + +STEPS_2_FUNCTIONAL = """\ +This test validates that the tool actually works by generating MIDI files, analyzing them, \ +and testing refinement. Since you cannot listen to audio, you will use programmatic MIDI \ +analysis as a proxy for quality. + +1. Find the README under scenarios/chiptune_generator/ to understand how to run the tool. +2. Based on the README, install any required dependencies (pip install -r requirements.txt \ +or similar). + +--- GENERATION TEST --- + +3. Run the generate command with this description: + "a melancholy chiptune with a slow tempo, minor key, and a simple repeating melody" + Save the output to /project/test_output/melancholy.mid + Create the /project/test_output/ directory first if it does not exist. +4. Verify the MIDI file was created at the expected path. +5. Run the analyze command on /project/test_output/melancholy.mid: + - Verify the analyze command runs without errors + - Check that the output includes: duration, tempo (BPM), channels used, note counts, \ +pitch ranges, velocities, and time signature + +6. A pre-built MIDI analysis script is available at /project/analyze_midi.py. + Install its dependency first: pip install mido + Run it on the melancholy MIDI file: + python /project/analyze_midi.py /project/test_output/melancholy.mid + It will print a JSON object with fields: total_duration_seconds, tempo_bpm, num_tracks, \ +channels_used, per_channel_note_count, per_channel_pitch_range, per_channel_avg_velocity, \ +total_note_count, and programs_used. +7. Examine the JSON output from the analysis script. 
Verify: + - The MIDI file has a duration between 30 and 120 seconds + - The tempo is reasonable for a "slow melancholy" description (roughly 60-100 BPM) + - At least 2 channels contain notes + - At least one channel uses a chiptune-style MIDI program number (synth lead programs \ +in the 80-87 range, or other retro-sounding instruments rather than piano/strings/brass) + - Total note count is at least 50 (enough for a recognizable melody) + - Notes show pitch and velocity variation (not all identical values) + +--- CONTRASTING GENERATION TEST --- + +8. Run generate a second time with a contrasting description: + "a fast energetic 8-bit boss battle theme with rapid arpeggios and driving percussion" + Save to /project/test_output/boss_battle.mid +9. Run the analysis script on boss_battle.mid and compare to melancholy.mid: + python /project/analyze_midi.py /project/test_output/boss_battle.mid + - The boss battle tune should have a higher tempo (BPM) than the melancholy tune + - The boss battle tune should have higher note density (more total notes or shorter \ +duration per note) than the melancholy tune + - This verifies the LLM interpretation actually controls the musical output + +--- REFINEMENT TEST --- + +10. Generate a base tune: + "a medium-tempo chiptune march with steady bass and a simple melody" + Save to /project/test_output/march_v1.mid +11. Run the analysis script on march_v1.mid and save the JSON output for comparison: + python /project/analyze_midi.py /project/test_output/march_v1.mid +12. Run the refine command on march_v1.mid with this feedback: + "make the bass line much more prominent with higher velocity and add more bass notes" + Save to /project/test_output/march_v2.mid +13. 
Run the analysis script on march_v2.mid and compare to march_v1.mid: + python /project/analyze_midi.py /project/test_output/march_v2.mid + - Identify the bass channel (the channel with the lowest average pitch range, or channel 3) + - The bass channel should have HIGHER average velocity in v2 compared to v1 + - The bass channel should have MORE notes in v2 compared to v1 + - The overall duration should be similar (within 50% of the original) +14. Run the refine command on march_v1.mid with a different instruction: + "double the tempo and make it much faster" + Save to /project/test_output/march_v3.mid +15. Run the analysis script on march_v3.mid and compare to march_v1.mid: + python /project/analyze_midi.py /project/test_output/march_v3.mid + - The tempo in v3 should be noticeably higher than v1 (at least 30% faster) + - The file should still be a valid MIDI with notes +16. Verify that refinement is not just regeneration: compare march_v1.mid and march_v2.mid. \ +Non-bass channels should have similar note counts (within 50% of the original), indicating \ +the refinement preserved aspects that were not asked to change.""" + +RUBRIC_2_FUNCTIONAL = { + "generate_runs": ( + "str - (10 points) Does the generate command run without errors and produce " + "a MIDI file for the melancholy description?" + ), + "midi_file_created": ("str - (5 points) Is a valid MIDI file created at the expected path?"), + "analyze_works": ( + "str - (10 points) Does the analyze command run and produce a readable report " + "with duration, tempo, channels, note counts, pitch ranges, and velocities?" + ), + "valid_duration": ("str - (5 points) Is the MIDI file duration between 30 and 120 seconds?"), + "appropriate_tempo": ( + "str - (5 points) Is the tempo reasonable for the slow melancholy description (roughly 60-100 BPM)?" 
+ ), + "multiple_channels": ("str - (5 points) Does the MIDI use at least 2 channels with notes?"), + "chiptune_programs": ( + "str - (5 points) Does at least one channel use a chiptune-style MIDI program " + "(synth lead programs in the 80-87 range, or other retro-sounding instruments " + "rather than piano/strings/brass)?" + ), + "note_variety": ("str - (5 points) Do notes show pitch and velocity variation (not all identical)?"), + "second_gen_different": ( + "str - (10 points) Is the boss battle tune measurably different from the melancholy " + "tune? The boss battle should have higher tempo and/or higher note density." + ), + "refine_runs": ( + "str - (5 points) Does the refine command run without errors when given an existing " + "MIDI file and modification instructions?" + ), + "bass_velocity_increased": ( + "str - (10 points) After requesting 'make the bass line more prominent with higher " + "velocity', did the bass channel's average velocity increase compared to the original?" + ), + "bass_notes_increased": ( + "str - (10 points) After requesting 'add more bass notes', did the bass channel's " + "note count increase compared to the original?" + ), + "tempo_refinement": ( + "str - (10 points) After requesting 'double the tempo', is the tempo in the refined " + "version at least 30% faster than the original?" + ), + "refinement_preserves_structure": ( + "str - (5 points) Does refinement preserve aspects not asked to change? Non-bass " + "channels should have similar note counts between the original and bass-refined version." + ), + "score": ( + "float - Score between 0 and 100 based on the above criteria. Sum the points earned from each criterion." 
+ ), +} + + +@click.command() +@click.option( + "--test-id", + default=lambda: get_test_id_from_env_or_default("dev"), + help="Test ID for result file naming (defaults to EVAL_RECIPES_TEST_ID env var)", +) +@click.option( + "--output-dir", + type=click.Path(path_type=Path), + default=lambda: Path(__file__).parents[0], + help="Directory to write result file", +) +@click.option( + "--instructions-file", + type=click.Path(path_type=Path), + default=None, + help="Path to instructions file (defaults to ./instructions.txt in working directory)", +) +def main(test_id: str, output_dir: Path, instructions_file: Path | None) -> int: + """Test script for chiptune_generator task.""" + return asyncio.run(run_test(test_id, output_dir, instructions_file)) + + +async def run_test(test_id: str, output_dir: Path, instructions_file: Path | None) -> int: + instructions = get_instructions_from_file_or_default(instructions_file=instructions_file) + + try: + logger.info("Running semantic test 1: Checking architecture and implementation...") + result_1 = await semantic_test( + steps=STEPS_1_ARCHITECTURE, + rubric=RUBRIC_1_ARCHITECTURE, + context=instructions, + working_dir=Path("/project"), + ) + + logger.info("Running semantic test 2: Functional testing (generation, analysis, refinement)...") + result_2 = await semantic_test( + steps=STEPS_2_FUNCTIONAL, + rubric=RUBRIC_2_FUNCTIONAL, + context=instructions, + working_dir=Path("/project"), + ) + + # Weights: architecture (35%), functional (65%) + final_score = result_1.score * 0.35 + result_2.score * 0.65 + + metadata = { + "instructions": instructions, + "semantic_test_1_architecture": { + "score": result_1.score, + "details": result_1.metadata, + }, + "semantic_test_2_functional": { + "score": result_2.score, + "details": result_2.metadata, + }, + "final_score": final_score, + "scoring_weights": { + "architecture": "35%", + "functional": "65%", + }, + } + + write_test_result(output_dir, test_id, final_score, metadata) + 
logger.info(f"Test completed with final score: {final_score:.1f}/100") + return 0 + + except Exception as e: + logger.error(f"Test failed with exception: {e}") + metadata = { + "instructions": instructions, + "error": str(e), + } + write_test_result(output_dir, test_id, 0, metadata) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/data/tasks/chiptune_generator/test_time_data/analyze_midi.py b/data/tasks/chiptune_generator/test_time_data/analyze_midi.py new file mode 100644 index 0000000..09f78cb --- /dev/null +++ b/data/tasks/chiptune_generator/test_time_data/analyze_midi.py @@ -0,0 +1,73 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Standalone MIDI analysis script for chiptune_generator benchmark evaluation. + +Parses a MIDI file and prints a JSON report to stdout with musical properties +that can be used to programmatically evaluate generated chip tunes. + +Usage: + python analyze_midi.py <midi_file> +""" + +import json +import subprocess
import sys + +try: + import mido +except ImportError: + subprocess.check_call([sys.executable, "-m", "pip", "install", "mido"], stdout=subprocess.DEVNULL) + import mido + + +def analyze_midi(file_path: str) -> dict: + mid = mido.MidiFile(file_path) + + tempo_bpm = 120 # MIDI default + channels_used: set[int] = set() + note_counts: dict[int, int] = {} + pitch_ranges: dict[int, list[int]] = {} # channel -> [min, max] + velocity_sums: dict[int, float] = {} + programs: dict[int, int] = {} + + for track in mid.tracks: + for msg in track: + if msg.type == "set_tempo": + tempo_bpm = round(mido.tempo2bpm(msg.tempo)) + elif msg.type == "program_change": + programs[msg.channel] = msg.program + elif msg.type == "note_on" and msg.velocity > 0: + ch = msg.channel + channels_used.add(ch) + note_counts[ch] = note_counts.get(ch, 0) + 1 + velocity_sums[ch] = velocity_sums.get(ch, 0.0) + msg.velocity + if ch not in pitch_ranges: + pitch_ranges[ch] = [msg.note, msg.note] + else: + pitch_ranges[ch][0] = 
min(pitch_ranges[ch][0], msg.note) + pitch_ranges[ch][1] = max(pitch_ranges[ch][1], msg.note) + + total_note_count = sum(note_counts.values()) + per_channel_avg_velocity = { + str(ch): round(velocity_sums[ch] / note_counts[ch], 2) for ch in note_counts if note_counts[ch] > 0 + } + + return { + "total_duration_seconds": round(mid.length, 2), + "tempo_bpm": tempo_bpm, + "num_tracks": len(mid.tracks), + "channels_used": sorted(channels_used), + "per_channel_note_count": {str(ch): count for ch, count in sorted(note_counts.items())}, + "per_channel_pitch_range": {str(ch): rng for ch, rng in sorted(pitch_ranges.items())}, + "per_channel_avg_velocity": {str(ch): per_channel_avg_velocity[str(ch)] for ch in sorted(note_counts)}, + "total_note_count": total_note_count, + "programs_used": {str(ch): prog for ch, prog in sorted(programs.items())}, + } + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print(f"Usage: {sys.argv[0]} <midi_file>", file=sys.stderr) + sys.exit(1) + result = analyze_midi(sys.argv[1]) + print(json.dumps(result, indent=2)) diff --git a/data/tasks/cpsc_recall_monitor/task.yaml b/data/tasks/cpsc_recall_monitor/task.yaml index a071c9c..3c16b14 100644 --- a/data/tasks/cpsc_recall_monitor/task.yaml +++ b/data/tasks/cpsc_recall_monitor/task.yaml @@ -4,13 +4,11 @@ task_info: non_deterministic_evals: true categories: - cli_tool - - web_scraping - - data_extraction + - web evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | I need a tool that monitors CPSC consumer product recalls. It should take a month (like "January 2025" or "2025-01") and create a CSV file cataloging all recalled products for that month. 
diff --git a/data/tasks/cpsc_recall_monitor/test.py b/data/tasks/cpsc_recall_monitor/test.py index 9a9e735..22a0d4a 100644 --- a/data/tasks/cpsc_recall_monitor/test.py +++ b/data/tasks/cpsc_recall_monitor/test.py @@ -14,71 +14,44 @@ write_test_result, ) -# Define Semantic Test 1: Run Tool and Validate CSV Output Structure - -STEPS_1_RUN_AND_VALIDATE_CSV = """1. Find and read the README to understand how to run the CPSC recall tool. +STEPS = """\ +1. Find and read the README to understand how to run the CPSC recall tool. 2. Based on the README, install any required dependencies if needed. -3. Determine the correct command to run the tool with a recent month (use "January 2025" or "2025-01" as the test input). -4. Run the tool. This may take several minutes as it needs to fetch data from CPSC. - - If the tool fails to run or produces errors, note the specific error and assign a score of 0 for tool execution. -5. After the tool completes, locate the output CSV file: - - Check if a CSV file was created - - Verify the file is in the expected location (as described in README or instructions) -6. Open and examine the CSV file structure: +3. Run the tool with "December 2024" (or "2024-12"). This may take several minutes as it needs to \ +fetch data from CPSC. + - If the tool fails to run or produces errors, note the specific error and assign a score of 0. +4. After the tool completes, locate the output CSV file and examine its structure: - Does it have headers? - - Are the required columns present: product name, recall date, manufacturer, hazard description, recall URL? - - Is the CSV properly formatted (valid CSV syntax)? - - Can you parse it without errors? -7. Check data completeness: - - Are there any records in the CSV? - - Are critical fields populated (not all empty)? 
- - Do the recall dates correspond to the requested month?""" - -RUBRIC_1_RUN_AND_VALIDATE_CSV = { - "readme_clear": "str - (5 points) Is the README clear about how to run the tool?", - "tool_runs_successfully": "str - (25 points) Does the tool run without errors when given a valid month?", - "csv_file_created": "str - (15 points) Is a CSV file created as output?", - "csv_has_headers": "str - (10 points) Does the CSV have proper column headers?", - "required_columns_present": "str - (20 points) Are all required columns present (product name, recall date, manufacturer, hazard, URL)?", - "csv_properly_formatted": "str - (10 points) Is the CSV properly formatted and parseable?", - "csv_has_data": "str - (10 points) Does the CSV contain recall records (not empty)?", - "critical_fields_populated": "str - (5 points) Are critical fields like product name and hazard populated?", - "score": "float - Score between 0 and 100 based on the above criteria. Sum the points earned from each criterion.", -} - - -# Define Semantic Test 2: Data Quality and Accuracy Validation - -STEPS_2_DATA_QUALITY = """1. Find and read the README to understand how to run the tool. -2. Run the tool with a specific month: "December 2024" (or "2024-12"). -3. After the tool completes, locate and open the generated CSV file. -4. Validate data quality and accuracy: - - Check that recall dates in the CSV are actually from December 2024 - - Verify that URLs are valid CPSC recall URLs (should contain "cpsc.gov" or similar official CPSC domain) - - Examine manufacturer fields - do they contain actual company names (not empty or generic)? - - Examine hazard descriptions - are they meaningful descriptions of actual hazards? - - Check product names - are they specific product descriptions? -5. Test URL validity: + - Are the required columns present: product name, recall date, manufacturer, hazard description, \ +and recall URL? + - Is the CSV properly formatted (valid CSV syntax, parseable without errors)? 
+ - Does it contain recall records (not empty)? +5. Validate data quality: + - Check that recall dates are actually from December 2024 + - Verify URLs are valid CPSC recall URLs (should contain "cpsc.gov" or similar official CPSC domain) + - Check that manufacturer fields contain actual company names (not empty or generic) + - Check that hazard descriptions are meaningful and specific + - Check that product names are specific product descriptions +6. Test URL validity: - Pick 2-3 random URLs from the CSV - - Make HTTP requests to verify these URLs are actually accessible (return 200/300-level responses) - - Award full points if the tested URLs work, partial/no points if they fail -6. Test with an alternative month format to verify robustness: + - Make HTTP requests to verify they are actually accessible (return 200/300-level responses) + - If accessible, spot-check that the information in the CSV matches what's on the CPSC website +7. Test alternative month format: - Run the tool again with "2024-11" (November 2024) - - Verify it handles the different format correctly - - Check that the output CSV contains recalls from November 2024 -7. 
Validate that the tool fetches real CPSC data: - - For the 2-3 URLs tested above, if accessible, check if the information in the CSV matches what's on the CPSC website""" + - Verify it handles the different format correctly and outputs recalls from November 2024""" -RUBRIC_2_DATA_QUALITY = { - "tool_runs_successfully": "str - (10 points) Does the tool run successfully with the test month?", - "recall_dates_correct_month": "str - (20 points) Are the recall dates actually from the requested month (December 2024)?", - "urls_are_valid_cpsc": "str - (10 points) Are the URLs valid CPSC recall URLs (proper format, contain cpsc.gov)?", - "urls_actually_work": "str - (15 points) When testing 2-3 random URLs with HTTP requests, are they actually accessible?", - "manufacturers_meaningful": "str - (10 points) Do manufacturer fields contain actual company names?", - "hazards_meaningful": "str - (10 points) Are hazard descriptions meaningful and specific?", - "products_specific": "str - (10 points) Are product names specific and descriptive?", - "alternative_format_works": "str - (10 points) Does the tool work with alternative month format (2024-11)?", - "data_matches_cpsc_website": "str - (5 points) When spot-checking accessible URLs, does the CSV data match the actual CPSC pages?", +RUBRIC = { + "readme_clear": "str - (5 points) Is the README clear about how to run the tool?", + "tool_runs_successfully": "str - (20 points) Does the tool run without errors when given 'December 2024'?", + "csv_file_created": "str - (10 points) Is a CSV file created as output?", + "required_columns_present": "str - (10 points) Are all required columns present (product name, recall date, manufacturer, hazard, URL) with proper headers?", + "csv_properly_formatted": "str - (5 points) Is the CSV properly formatted and parseable?", + "csv_has_data": "str - (5 points) Does the CSV contain recall records with critical fields populated?", + "recall_dates_correct_month": "str - (10 points) Are the recall 
dates actually from the requested month (December 2024)?", + "urls_are_valid_cpsc": "str - (5 points) Are the URLs valid CPSC recall URLs (proper format, contain cpsc.gov)?", + "urls_actually_work": "str - (10 points) When testing 2-3 URLs with HTTP requests, are they accessible and does the CSV data match the CPSC pages?", + "data_fields_meaningful": "str - (10 points) Are manufacturer names, hazard descriptions, and product names meaningful and specific (not empty or generic)?", + "alternative_format_works": "str - (10 points) Does the tool work with alternative month format ('2024-11') and produce recalls from November 2024?", "score": "float - Score between 0 and 100 based on the above criteria. Sum the points earned from each criterion.", } @@ -110,45 +83,21 @@ async def run_test(test_id: str, output_dir: Path, instructions_file: Path | Non instructions = get_instructions_from_file_or_default(instructions_file=instructions_file) try: - logger.info("Running semantic test 1: Running tool and validating CSV output structure...") - result_1 = await semantic_test( - steps=STEPS_1_RUN_AND_VALIDATE_CSV, - rubric=RUBRIC_1_RUN_AND_VALIDATE_CSV, + logger.info("Running semantic test: Validating tool output and data quality...") + result = await semantic_test( + steps=STEPS, + rubric=RUBRIC, context=instructions, working_dir=Path("/project"), ) - logger.info("Running semantic test 2: Validating data quality and accuracy...") - result_2 = await semantic_test( - steps=STEPS_2_DATA_QUALITY, - rubric=RUBRIC_2_DATA_QUALITY, - context=instructions, - working_dir=Path("/project"), - ) - - # Calculate final score with weighted average - # Weights: CSV output validation (50%), data quality (50%) - final_score = result_1.score * 0.50 + result_2.score * 0.50 - metadata = { "instructions": instructions, - "semantic_test_1_csv_output": { - "score": result_1.score, - "details": result_1.metadata, - }, - "semantic_test_2_data_quality": { - "score": result_2.score, - "details": 
result_2.metadata, - }, - "final_score": final_score, - "scoring_weights": { - "csv_output_validation": "50%", - "data_quality": "50%", - }, + "semantic_test_result": result.metadata, } - write_test_result(output_dir, test_id, final_score, metadata) - logger.info(f"Test completed with final score: {final_score:.1f}/100") + write_test_result(output_dir, test_id, result.score, metadata) + logger.info(f"Test completed with final score: {result.score:.1f}/100") return 0 except Exception as e: diff --git a/data/tasks/cross_repo_improvement_tool/task.yaml b/data/tasks/cross_repo_improvement_tool/task.yaml index 2d7234c..89657f0 100644 --- a/data/tasks/cross_repo_improvement_tool/task.yaml +++ b/data/tasks/cross_repo_improvement_tool/task.yaml @@ -4,12 +4,11 @@ task_info: non_deterministic_evals: true categories: - cli_tool - - software_engineering_tools + - ai_tool evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | I would like a new tool that uses repomix (`npx repomix@latest`, check it out) to roll up a "source" local or remote repo and also a "target" repo and any additional requests I want to pass along with the input. The tool should then take all that and deeply analyze the relevant parts of both diff --git a/data/tasks/cross_repo_improvement_tool/test.py b/data/tasks/cross_repo_improvement_tool/test.py index 5e685eb..069565f 100644 --- a/data/tasks/cross_repo_improvement_tool/test.py +++ b/data/tasks/cross_repo_improvement_tool/test.py @@ -47,9 +47,9 @@ } -# Define Semantic Test 2: Run Tool & Validate Output Structure +# Define Semantic Test 2: Run Tool & Validate Output -STEPS_2_RUN_AND_VALIDATE_STRUCTURE = f"""1. Find the README file that explains how to use the cross-repo improvement tool. +STEPS_2_RUN_AND_VALIDATE = f"""1. Find the README file that explains how to use the cross-repo improvement tool. 2. 
Based on the README, determine the correct command to run the tool with these parameters: - Source repo: {SOURCE_REPO} - Target repo: {TARGET_REPO} @@ -71,57 +71,38 @@ * Sufficient detail for handoff to a team without source access - Check if file/directory names are based on the input context (repos and request) - Look for evidence of review iterations (feedback files, iteration logs, etc.) -6. Verify the tool completed successfully and produced organized, structured output.""" - -RUBRIC_2_RUN_AND_VALIDATE_STRUCTURE = { - "readme_exists": "str - (5 points) Does a comprehensive README exist with clear usage instructions?", - "tool_runs_successfully": "str - (20 points) Does the tool run without fatal errors?", - "repomix_execution": "str - (10 points) Is there evidence the tool ran repomix on both repositories?", - "comprehensive_analysis_file": "str - (15 points) Does output include comprehensive analysis with high-level to detailed opportunity breakdown and rationale?", - "separate_proposal_files": "str - (15 points) Are individual opportunities documented in separate detailed proposal files?", - "contextual_naming": "str - (10 points) Are output directories and files named based on the repos and request context?", - "implementation_guidance": "str - (15 points) Do proposals include detailed implementation guidance suitable for handoff?", - "review_iteration_evidence": "str - (10 points) Is there evidence of feedback loop iterations (feedback files, version history, iteration logs)?", - "score": "float - Score between 0 and 100. Sum the points earned from each criterion above.", -} - - -# Define Semantic Test 3: Deep Quality Validation - -STEPS_3_DEEP_QUALITY_VALIDATION = f"""1. Find the README file that explains how to use the cross-repo improvement tool. -2. Based on the README, determine the correct command to run the tool with these parameters: - - Source repo: {SOURCE_REPO} - - Target repo: {TARGET_REPO} - - Improvement request: "{IMPROVEMENT_REQUEST}" -3. 
Run the command. This will take 10-20 minutes as it runs repomix and performs multi-phase analysis. -4. After completion, deeply examine the comprehensive analysis file: +6. Deeply examine the comprehensive analysis file for quality: - Are the identified opportunities actually present in github/spec-kit's codebase/patterns? - Do the opportunities make sense for github/copilot-cli's purpose and architecture? - Is the analysis grounded in actual code, patterns, and design from both repos (not generic advice)? - Does it demonstrate deep understanding of both tools' purposes and design philosophies? -5. Select 2-3 individual proposal files and analyze them deeply: +7. Select 2-3 individual proposal files and analyze them deeply: - Specificity: Do proposals reference actual code, patterns, or structures from spec-kit? - Applicability: Are suggestions realistic and appropriate for copilot-cli's architecture? - Implementation guidance: Is the guidance specific enough for a team to implement? - Completeness: Could a team implement these proposals without accessing spec-kit's source code? - Value: Would these improvements genuinely benefit copilot-cli users and align with its goals? -6. Check for review quality evidence: +8. Check for review quality evidence: - Are proposals grounded to both repos (not generic "best practices")? - Do suggestions align with copilot-cli's existing patterns and philosophy? - Is there evidence that multiple reviewers influenced the final quality? -7. Overall assessment: +9. Overall assessment: - Would the GitHub team maintaining copilot-cli find this analysis valuable? - - Are suggestions actionable, well-reasoned, and worth considering? 
- - Does the analysis justify the tool's complex multi-reviewer feedback loop architecture?""" - -RUBRIC_3_DEEP_QUALITY_VALIDATION = { - "opportunities_grounded_in_source": "str - (20 points) Do opportunities reference actual patterns/code from spec-kit (not generic suggestions)?", - "appropriate_for_target": "str - (15 points) Do suggestions make sense for copilot-cli's purpose, architecture, and user experience?", - "deep_understanding": "str - (15 points) Does analysis demonstrate deep understanding of both tools' designs and philosophies?", - "proposal_specificity": "str - (15 points) Do proposals include specific references to source patterns rather than generic advice?", - "implementation_completeness": "str - (15 points) Is implementation guidance detailed enough for handoff without requiring source access?", - "actionability": "str - (10 points) Are suggestions realistic, actionable, and well-reasoned?", - "overall_value": "str - (10 points) Would the GitHub copilot-cli team find this analysis genuinely valuable?", + - Are suggestions actionable, well-reasoned, and worth considering?""" + +RUBRIC_2_RUN_AND_VALIDATE = { + "readme_exists": "str - (5 points) Does a comprehensive README exist with clear usage instructions?", + "tool_runs_successfully": "str - (10 points) Does the tool run without fatal errors?", + "repomix_execution": "str - (5 points) Is there evidence the tool ran repomix on both repositories?", + "comprehensive_analysis_file": "str - (10 points) Does output include comprehensive analysis with high-level to detailed opportunity breakdown and rationale?", + "separate_proposal_files": "str - (10 points) Are individual opportunities documented in separate detailed proposal files?", + "contextual_naming": "str - (5 points) Are output directories and files named based on the repos and request context?", + "review_iteration_evidence": "str - (5 points) Is there evidence of feedback loop iterations (feedback files, version history, iteration 
logs)?", + "opportunities_grounded_in_source": "str - (15 points) Do opportunities reference actual patterns/code from spec-kit (not generic suggestions)?", + "appropriate_for_target": "str - (10 points) Do suggestions make sense for copilot-cli's purpose, architecture, and user experience?", + "deep_understanding": "str - (5 points) Does analysis demonstrate deep understanding of both tools' designs and philosophies?", + "proposal_specificity_and_guidance": "str - (10 points) Do proposals include specific references to source patterns with implementation guidance detailed enough for handoff without requiring source access?", + "actionability_and_value": "str - (10 points) Are suggestions realistic, actionable, well-reasoned, and genuinely valuable to the copilot-cli team?", "score": "float - Score between 0 and 100. Sum the points earned from each criterion above.", } @@ -161,25 +142,16 @@ async def run_test(test_id: str, output_dir: Path, instructions_file: Path | Non working_dir=Path("/project"), ) - logger.info("Running semantic test 2: Run Tool & Validate Output Structure...") + logger.info("Running semantic test 2: Run Tool & Validate Output...") result_2 = await semantic_test( - steps=STEPS_2_RUN_AND_VALIDATE_STRUCTURE, - rubric=RUBRIC_2_RUN_AND_VALIDATE_STRUCTURE, - context=instructions, - working_dir=Path("/project"), - ) - - logger.info("Running semantic test 3: Deep Quality Validation...") - result_3 = await semantic_test( - steps=STEPS_3_DEEP_QUALITY_VALIDATION, - rubric=RUBRIC_3_DEEP_QUALITY_VALIDATION, + steps=STEPS_2_RUN_AND_VALIDATE, + rubric=RUBRIC_2_RUN_AND_VALIDATE, context=instructions, working_dir=Path("/project"), ) # Calculate final score as weighted average - # Weights: architecture (20%), structure (35%), quality (45%) - final_score = result_1.score * 0.20 + result_2.score * 0.35 + result_3.score * 0.45 + final_score = result_1.score * 0.30 + result_2.score * 0.70 metadata = { "instructions": instructions, @@ -192,19 +164,14 @@ async def 
run_test(test_id: str, output_dir: Path, instructions_file: Path | Non "score": result_1.score, "details": result_1.metadata, }, - "semantic_test_2_run_and_validate_structure": { + "semantic_test_2_run_and_validate": { "score": result_2.score, "details": result_2.metadata, }, - "semantic_test_3_deep_quality_validation": { - "score": result_3.score, - "details": result_3.metadata, - }, "final_score": final_score, "scoring_weights": { - "architecture_review": "20%", - "run_and_validate_structure": "35%", - "deep_quality_validation": "45%", + "architecture_review": "30%", + "run_and_validate": "70%", }, } diff --git a/data/tasks/email_drafting/task.yaml b/data/tasks/email_drafting/task.yaml index 8b69fb2..155f312 100644 --- a/data/tasks/email_drafting/task.yaml +++ b/data/tasks/email_drafting/task.yaml @@ -4,12 +4,11 @@ task_info: non_deterministic_evals: true categories: - cli_tool - - writing + - ai_tool evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | Create me a CLI tool that will take bullet points or rough notes about what I want to communicate and draft an email in my communication style. Place all the code under the directory scenarios/email_drafting. @@ -28,6 +27,7 @@ instructions: | Once it all passes, write the final version out for me to review. Give me the opportunity to mark up the email with [bracket-enclosed-comments] and then pass it back to the tool to incorporate my feedback as the final reviewer - start back with the writer and then review again with the other reviewers, including passing my feedback along with the other context they previously had. Finally, create a README.md under scenarios/email_drafting/ that gives examples of each of the commands that can be used to run the tool. + You have an OPENAI_API_KEY and ANTHROPIC_API_KEY available for any LLM API calls you need to make. 
test_time_files: - source: test_time_data dest: /project diff --git a/data/tasks/email_drafting/test.py b/data/tasks/email_drafting/test.py index 144340d..c8af476 100644 --- a/data/tasks/email_drafting/test.py +++ b/data/tasks/email_drafting/test.py @@ -14,30 +14,48 @@ write_test_result, ) -# Define Semantic Test 1 - Agent SDK Check - -AGENT_SDK_DEFINITION = """The solution should use an Agent SDK, such as Claude Agent/Code SDK, Microsoft Agent Framework, Microsoft Amplifier (https://github.com/microsoft/amplifier/tree/next), OpenAI Codex CLI, or others that are similarly capable. These SDKs must have the following functionality: -- Automatic Context Management to ensure your agent doesn't run out of context. -- Rich tool ecosystem: File operations, code execution, web search, and MCP extensibility -- Excels at code generation and effectively gives the agent a "computer" where it can find appropriate files, write and edit files, lint the code, run it, debug, edit, and sometimes take these actions iteratively until it succeeds. -- APIs like OpenAI's chat completions or Responses API, Anthropic's Messages API, or Azure OpenAI alone are NOT sufficient and should not recieve any credit.""" - -STEPS_1_AGENT_SDK_DEPENDENCY = f"""{AGENT_SDK_DEFINITION} - -1. Explore the code that the agent generated in this project directory to see if it uses an Agent SDK. -2. Look for where dependencies are defined (e.g., pyproject.toml, requirements.txt, package.json, etc.) -3. Check which dependencies are being imported and actually used in the code to create the email drafting solution. -4. Verify the SDK provides the required agent capabilities: automatic context management, rich tool ecosystem (file operations, code execution), and iterative code generation/debugging capabilities. -5. 
Confirm it is NOT just a plain API client (like openai.OpenAI() or anthropic.Anthropic() without agent features).""" - -RUBRIC_1_AGENT_SDK_DEPENDENCY = { - "agent_sdk_identified": "str - Name of the Agent SDK found (e.g., 'Claude Agent SDK', 'Microsoft Agent Framework', 'Amplifier', 'OpenAI Codex CLI', 'None')", - "has_context_management": "bool - Does it provide automatic context management?", - "has_tool_ecosystem": "bool - Does it provide rich tool ecosystem (file operations, code execution, web search)?", - "has_iterative_capabilities": "bool - Can it iteratively write, run, debug, and edit code?", - "evidence": "str - Brief description of evidence found (dependency files, imports, usage patterns)", - "justification": "str - Detailed explanation of whether this qualifies as an Agent SDK based on the criteria", - "score": "float - Score 100 if the solution uses a qualifying Agent SDK with all required capabilities. Score 0 if it uses plain API clients or lacks required capabilities.", +AGENT_SDK_NOTE = """\ +NOTE: The solution may use either direct LLM API calls (OpenAI, Anthropic) or an Agent SDK such as \ +Claude Agent/Code SDK, Microsoft Amplifier (https://github.com/microsoft/amplifier/tree/next), OpenAI \ +Codex CLI, or others that are similarly capable. Both approaches are acceptable. If an Agent SDK is used, \ +its built-in LLM capabilities count as satisfying the LLM usage checks for each stage below.""" + +STEPS_1_LLM_PIPELINE = f"""\ +{AGENT_SDK_NOTE} + +1. Explore the code under scenarios/email_drafting/ to understand the full implementation. +2. Check that it reads past emails from a directory and source notes/bullet points from a file. +3. Check for a **writer** stage that uses an LLM to draft an email based on the notes and past email style. +4. Check for a **content-reviewer** stage that uses an LLM to verify all key points from the notes are covered \ +and nothing important was missed or misrepresented. +5. 
Check for a **tone-reviewer** stage that uses an LLM to verify the draft matches the user's communication \ +style, formality level, and typical email patterns from past emails. +6. Check for a **brevity-reviewer** stage that uses an LLM to verify email length is appropriate for the \ +selected mode and is not unnecessarily verbose. +7. Check for revision logic: if any reviewer finds issues, the draft should go back to the writer and through \ +the reviewers again. +8. Check for a mode flag (--mode concise|standard|detailed) that controls email length. +9. Check for an optional recipient parameter that prioritizes past emails to/from that specific recipient. +10. Check for a feedback loop: the user can mark up the draft with [bracket-enclosed-comments] and pass it \ +back to the tool to incorporate feedback, restarting the write-and-review cycle. +11. Verify the solution uses a recent, up-to-date LLM from Anthropic or OpenAI. If an Agent SDK is used, \ +check which model it is configured with. Check the model identifier in the code against these references: + - Anthropic models: https://platform.claude.com/docs/en/about-claude/models/overview + - OpenAI models: https://platform.openai.com/docs/models""" + +RUBRIC_1_LLM_PIPELINE = { + "reads_past_emails": "str - (5 points) Does the code read past emails from a directory?", + "reads_source_notes": "str - (5 points) Does the code read source notes/bullet points from a file?", + "writer_uses_llm": "str - (10 points) Is there a writer stage that uses an LLM to draft an email matching the user's style? Agent SDKs with built-in LLM capabilities count.", + "content_reviewer_uses_llm": "str - (10 points) Is there a content-reviewer stage that uses an LLM to verify key points are covered? Agent SDKs with built-in LLM capabilities count.", + "tone_reviewer_uses_llm": "str - (10 points) Is there a tone-reviewer stage that uses an LLM to verify the draft matches communication style? 
Agent SDKs with built-in LLM capabilities count.", + "brevity_reviewer_uses_llm": "str - (10 points) Is there a brevity-reviewer stage that uses an LLM to verify appropriate length for the selected mode? Agent SDKs with built-in LLM capabilities count.", + "revision_loop": "str - (10 points) If a reviewer finds issues, does the draft go back to the writer and through reviewers again?", + "mode_flag": "str - (5 points) Is there a mode flag (concise/standard/detailed) controlling email length?", + "recipient_parameter": "str - (5 points) Is there an optional recipient parameter that prioritizes matching past emails?", + "feedback_loop": "str - (5 points) Can the user mark up the draft with [bracket-enclosed-comments] and pass it back?", + "uses_recent_model": "str - (25 points) Does it use a recent model from Anthropic (see https://platform.claude.com/docs/en/about-claude/models/overview) or OpenAI (see https://platform.openai.com/docs/models)? If an Agent SDK is used, check which model it is configured with. 5 points partial credit if a model is used but it is not recent.", + "score": "float - Score between 0 and 100 based on the above criteria. 
Sum the points earned from each criterion.", } # Define Semantic Test 2 @@ -91,10 +109,10 @@ async def run_test(test_id: str, output_dir: Path, instructions_file: Path | Non instructions = get_instructions_from_file_or_default(instructions_file=instructions_file) try: - logger.info("Running semantic test 1 to check for Agent SDK dependency...") + logger.info("Running semantic test 1 to verify LLM pipeline implementation...") result_1 = await semantic_test( - steps=STEPS_1_AGENT_SDK_DEPENDENCY, - rubric=RUBRIC_1_AGENT_SDK_DEPENDENCY, + steps=STEPS_1_LLM_PIPELINE, + rubric=RUBRIC_1_LLM_PIPELINE, context=instructions, working_dir=Path("/project"), ) diff --git a/data/tasks/energy_forecast_new_england/task.yaml b/data/tasks/energy_forecast_new_england/task.yaml new file mode 100644 index 0000000..c1b4599 --- /dev/null +++ b/data/tasks/energy_forecast_new_england/task.yaml @@ -0,0 +1,29 @@ +name: energy_forecast_new_england +task_info: + difficulty: hard + non_deterministic_evals: true + categories: + - knowledge_worker +evaluation_configs: + - type: score + test_script: test.py + test_command: uv run --no-project /project/test.py +instructions: | + Build a CLI tool that forecasts 24-hour electricity demand for the New England region. + + The tool should: + 1. Acquire historical electricity demand data for New England from publicly available sources. + 2. Train a machine learning model on the historical data to predict hourly electricity demand. + The training process should take at most 10 minutes on a 2026 workstation PC. + 3. Generate a 24-hour demand forecast starting from the next available hour. + 4. Save the forecast to a JSON file at /project/forecast.json with the following structure: + { + "forecast": [ + {"hour": "2025-01-15T00:00:00", "demand_mw": 12500.0}, + {"hour": "2025-01-15T01:00:00", "demand_mw": 12100.0}, + ... + ] + } + 5. 
Save the trained model's evaluation metrics and a brief report to /project/model_report.md that you would give to a load forecaster. + 6. Include a README.md at /project/README.md with instructions on how to run the tool, what dependencies are needed, and how to reproduce the forecast. +timeout: 4500 diff --git a/data/tasks/energy_forecast_new_england/test.py b/data/tasks/energy_forecast_new_england/test.py new file mode 100644 index 0000000..2594cd9 --- /dev/null +++ b/data/tasks/energy_forecast_new_england/test.py @@ -0,0 +1,235 @@ +# Copyright (c) Microsoft. All rights reserved. + +import asyncio +from pathlib import Path +import sys + +import click +from loguru import logger + +from eval_recipes.benchmarking.evaluation.semantic_test import semantic_test +from eval_recipes.benchmarking.evaluation.test_utils import ( + get_instructions_from_file_or_default, + get_test_id_from_env_or_default, + write_test_result, +) + +# Define Semantic Test 1: Data Acquisition and ML Implementation Review + +STEPS_1_IMPLEMENTATION = """\ +1. Explore the /project directory to understand the full implementation. +2. Look for dependency files (pyproject.toml, requirements.txt, etc.) to understand what ML libraries \ +are used. +3. Check for data acquisition code: + - Does it download real historical energy data from a public source? + - What source does it use? Is it a legitimate public energy or government data source? + - How much data does it acquire? (At least 30 days of hourly data = 720+ data points) +4. Check the ML implementation: + - What algorithm is used? (Linear regression, random forest, gradient boosting, neural network, etc.) + - Is there feature engineering beyond just raw demand values? + - Look for time-based features (hour of day, day of week, month, holiday indicators) + - Look for lagged features (previous hours/days demand) + - Look for external features (temperature, weather data) +5. 
Check for model validation: + - Is there a train/test split or cross-validation? + - Are evaluation metrics computed (RMSE, MAE, MAPE, etc.)? +6. Check for a model_report.md that documents the approach. +7. Check for a README.md with usage instructions.""" + +RUBRIC_1_IMPLEMENTATION: dict[str, str] = { + "real_data_source": """\ +str - (15 points) Does the code download real historical energy data from a legitimate public source? \ +Verify the data source URL or API is real. Full credit for a regional grid operator's published data. \ +10 points for other legitimate energy or government data sources.""", + "sufficient_data_volume": """\ +str - (10 points) Does the code acquire at least 30 days of hourly data (720+ data points)? More data \ +is better. 10 points for 30+ days, 5 points for 7-30 days, 0 for less.""", + "ml_algorithm_quality": """\ +str - (25 points) What ML algorithm is used? 25 points for gradient boosting, ensemble methods, or \ +neural networks. 15 points for a single decision tree model or support vector regression. 5 points \ +for linear regression or trivial approaches. 0 if no ML model.""", + "feature_engineering": """\ +str - (20 points) Does the code perform meaningful feature engineering? Check for: time-based features \ +(hour, day of week, month) worth 8 points; lagged demand features worth 6 points; rolling statistics \ +(moving averages, etc.) worth 3 points; external data like weather worth 3 points.""", + "model_validation": """\ +str - (10 points) Is there model validation? 10 points for cross-validation or robust train/test \ +methodology with reported metrics. 5 points for a simple train/test split with metrics. 
0 for no \ +validation.""", + "model_report_exists": """\ +str - (10 points) Does model_report.md exist with a meaningful description of the data source, ML \ +approach, feature engineering, and validation results?""", + "readme_exists": """\ +str - (10 points) Does a README.md exist with clear usage and installation instructions?""", + "score": """\ +float - Score between 0 and 100 based on the above criteria. Sum the points earned from each \ +criterion.""", +} + + +# Define Semantic Test 2: Execution and Output Validation + +STEPS_2_EXECUTION = """\ +1. Find the README.md to understand how to run the tool. +2. Based on the README, install any required dependencies. +3. Run the tool. This may take up to 20 minutes as it needs to download data and train a model. + - If the tool fails or errors out, note the error and assign a score of 0. +4. After completion, check that /project/forecast.json exists and is valid JSON. +5. Validate the forecast structure: + - Does it contain a "forecast" array with 24 entries? + - Does each entry have "hour" and "demand_mw" (numeric) fields? + - Are the hours sequential (1-hour increments)? +6. Check that /project/model_report.md was generated and contains meaningful content. +7. 
Verify demand values are in a reasonable range for New England (ISO-NE total load is roughly 9,000-28,000 MW).""" + +RUBRIC_2_EXECUTION: dict[str, str] = { + "tool_runs_successfully": """\ +str - (30 points) Does the tool run without fatal errors and complete data download and model \ +training?""", + "forecast_json_exists": """\ +str - (10 points) Does /project/forecast.json exist and contain valid JSON?""", + "forecast_has_24_entries": """\ +str - (15 points) Does the forecast array contain exactly 24 entries with sequential hourly \ +timestamps?""", + "forecast_entry_structure": """\ +str - (10 points) Does each entry have 'hour' (ISO 8601 string) and 'demand_mw' (numeric) fields?""", + "demand_values_reasonable_range": """\ +str - (20 points) Are all 24 demand values within a plausible range for New England (ISO-NE total load is roughly 9,000-28,000 MW)? Deduct points for values outside this range.""", + "model_report_generated": """\ +str - (15 points) Does /project/model_report.md exist with meaningful content about the approach and results?""", + "score": """\ +float - Score between 0 and 100 based on the above criteria. Sum the points earned from each \ +criterion.""", +} + + +# Define Semantic Test 3: Forecast Plausibility Check + +STEPS_3_PLAUSIBILITY = """\ +1. Read the forecast from /project/forecast.json. +2. Determine what date range the forecast covers from the "hour" timestamps. +3. Try to look up current/recent New England electricity demand from public sources to establish a baseline \ +for what demand should look like. +4. Look up current weather conditions and forecasts for the New England region to understand expected \ +temperatures, which strongly influence electricity demand. +5. Analyze the forecast for temporal patterns: + - Does demand show expected daily patterns (lower at night around 2-5 AM, higher during day)? + - Is there appropriate variation between hours (not flat or random)? 
+ - Do the peak and trough values align with typical New England patterns?""" + +RUBRIC_3_PLAUSIBILITY: dict[str, str] = { + "daily_demand_pattern": """\ +str - (35 points) Does the forecast show a realistic daily demand curve? Demand should be lower during \ +nighttime hours (roughly 2-5 AM) and higher during daytime/evening. 35 points if the pattern is \ +clearly visible and realistic. 20 points if there is some variation but the pattern is weak. 0 if \ +demand is flat, random, or inversely patterned.""", + "hour_to_hour_variation": """\ +str - (25 points) Is there reasonable hour-to-hour variation (not all the same value, not wildly \ +jumping)? Adjacent hours should typically differ by less than 1,000 MW. 25 points for smooth, \ +realistic transitions. 12 points for somewhat reasonable. 0 for flat or erratic.""", + "weather_consistency": """\ +str - (20 points) Given current weather conditions and temperatures for the New England region, do \ +the forecast demand levels make sense? Higher temperatures in summer or lower temperatures in winter \ +should correspond to higher demand. 20 points if consistent with current conditions. 10 points if \ +partially consistent. 0 if clearly inconsistent.""", + "overall_forecast_credibility": """\ +str - (20 points) Overall assessment: would an energy analyst find this forecast credible as a rough \ +next-day forecast? Consider all the above factors holistically.""", + "score": """\ +float - Score between 0 and 100 based on the above criteria. 
Sum the points earned from each \ +criterion.""", +} + + +@click.command() +@click.option( + "--test-id", + default=lambda: get_test_id_from_env_or_default("dev"), + help="Test ID for result file naming (defaults to EVAL_RECIPES_TEST_ID env var)", +) +@click.option( + "--output-dir", + type=click.Path(path_type=Path), + default=lambda: Path(__file__).parents[0], + help="Directory to write result file", +) +@click.option( + "--instructions-file", + type=click.Path(path_type=Path), + default=None, + help="Path to instructions file (defaults to ./instructions.txt in working directory)", +) +def main(test_id: str, output_dir: Path, instructions_file: Path | None) -> int: + """Test script for energy_forecast_new_england task.""" + return asyncio.run(run_test(test_id, output_dir, instructions_file)) + + +async def run_test(test_id: str, output_dir: Path, instructions_file: Path | None) -> int: + instructions = get_instructions_from_file_or_default(instructions_file=instructions_file) + + try: + logger.info("Running semantic test 1: Data acquisition and ML implementation review...") + result_1 = await semantic_test( + steps=STEPS_1_IMPLEMENTATION, + rubric=RUBRIC_1_IMPLEMENTATION, + context=instructions, + working_dir=Path("/project"), + ) + + logger.info("Running semantic test 2: Execution and output validation...") + result_2 = await semantic_test( + steps=STEPS_2_EXECUTION, + rubric=RUBRIC_2_EXECUTION, + context=instructions, + working_dir=Path("/project"), + ) + + logger.info("Running semantic test 3: Forecast plausibility check...") + result_3 = await semantic_test( + steps=STEPS_3_PLAUSIBILITY, + rubric=RUBRIC_3_PLAUSIBILITY, + context=instructions, + working_dir=Path("/project"), + ) + + # Weights: implementation (25%), execution (35%), plausibility (40%) + final_score = result_1.score * 0.25 + result_2.score * 0.35 + result_3.score * 0.40 + + metadata = { + "instructions": instructions, + "semantic_test_1_implementation": { + "score": result_1.score, + "details": 
result_1.metadata, + }, + "semantic_test_2_execution": { + "score": result_2.score, + "details": result_2.metadata, + }, + "semantic_test_3_plausibility": { + "score": result_3.score, + "details": result_3.metadata, + }, + "final_score": final_score, + "scoring_weights": { + "implementation": "25%", + "execution": "35%", + "plausibility": "40%", + }, + } + + write_test_result(output_dir, test_id, final_score, metadata) + logger.info(f"Test completed with final score: {final_score:.1f}/100") + return 0 + + except Exception as e: + logger.error(f"Test failed with exception: {e}") + metadata = { + "instructions": instructions, + "error": str(e), + } + write_test_result(output_dir, test_id, 0, metadata) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/data/tasks/gdpval_extraction/task.yaml b/data/tasks/gdpval_extraction/task.yaml index 416f9b0..ab01b04 100644 --- a/data/tasks/gdpval_extraction/task.yaml +++ b/data/tasks/gdpval_extraction/task.yaml @@ -3,13 +3,11 @@ task_info: difficulty: easy non_deterministic_evals: true categories: - - data_extraction - - research + - web evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py --csv /project/gdpval_prompts.csv --summary /project/gdpval_summary.txt -dockerfile_portion: "" instructions: | Go to https://huggingface.co/datasets/openai/gdpval and output to a file called "gdpval_prompts.csv" all of the prompts in the dataset in column called "prompt" Then, output to a file called "gdpval_summary.txt" a factual and accurate synthesis of the types of tasks in the dataset grouped into 5 or so categories with examples. Keep it relatively brief, each category to one paragraph of 5 or less sentences. 
diff --git a/data/tasks/git_changelog_generator/task.yaml b/data/tasks/git_changelog_generator/task.yaml new file mode 100644 index 0000000..781ccff --- /dev/null +++ b/data/tasks/git_changelog_generator/task.yaml @@ -0,0 +1,32 @@ +name: git_changelog_generator +task_info: + difficulty: medium + non_deterministic_evals: true + categories: + - cli_tool + - ai_tool +evaluation_configs: + - type: score + test_script: test.py + test_command: uv run --no-project /project/test.py +instructions: | + Build a CLI tool that generates a user-friendly, categorized changelog between two git commits or + tags in a repository. The tool should go beyond a raw git log by using an LLM to intelligently + categorize and summarize the changes into a polished markdown document. + + The tool should: + 1. Accept a git repository URL (or local path) and two commit references (tags, SHAs, or branch + names) as input via the command line. + 2. Use agents to produce a well-structured markdown changelog that categorizes and summarizes + the changes between the two references. The result should be something a developer or manager + could read to quickly understand what changed, not a raw git log. + 3. Save the output to a markdown file (the filename or output path should be configurable via a + CLI option, with a sensible default). + 4. Handle edge cases gracefully: invalid refs, empty commit ranges, network errors when cloning. + + Include a README.md with usage instructions, examples, and a description of how the tool works. + You have an OPENAI_API_KEY and ANTHROPIC_API_KEY available for any LLM API calls you need to make. +test_time_files: + - source: test_time_data + dest: /project +timeout: 5400 diff --git a/data/tasks/git_changelog_generator/test.py b/data/tasks/git_changelog_generator/test.py new file mode 100644 index 0000000..2c7fe0e --- /dev/null +++ b/data/tasks/git_changelog_generator/test.py @@ -0,0 +1,165 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +import asyncio +from pathlib import Path +import sys + +import click +from loguru import logger + +from eval_recipes.benchmarking.evaluation.semantic_test import semantic_test +from eval_recipes.benchmarking.evaluation.test_utils import ( + get_instructions_from_file_or_default, + get_test_id_from_env_or_default, + write_test_result, +) + +AGENT_SDK_NOTE = """\ +NOTE: The solution may use either direct LLM API calls (OpenAI, Anthropic) or an Agent SDK such as \ +Claude Agent/Code SDK, Microsoft Amplifier (https://github.com/microsoft/amplifier/tree/next), OpenAI \ +Codex CLI, or others that are similarly capable. Both approaches are acceptable. If an Agent SDK is used, \ +its built-in LLM capabilities count as satisfying the LLM usage checks below.""" + +# Define Semantic Test 1: Code Review and Architecture + +STEPS_1_CODE_REVIEW = f"""\ +{AGENT_SDK_NOTE} + +1. Explore the code in the project directory to understand the full implementation. +2. Check that the solution uses **git** operations (via subprocess, gitpython, or similar) to retrieve \ +commits between two references. Verify it actually runs git commands uses a git library to get the commit log, +or relies on the agent to make the git call (any of these are acceptable). +3. Check that the solution uses an **LLM to categorize and summarize** the commits into meaningful \ +groups (not just dumping raw commit messages). This could be via direct API calls or an Agent SDK. +4. Check for a **CLI interface** that accepts: + - A repository URL or local path + - Two commit references (tags, SHAs, or branches) + - An option to configure the output file path +5. Check that the output is a **markdown file** with: + - Categorized sections (e.g., Features, Bug Fixes, etc.) + - Summarized descriptions within each category +6. Check that a **README.md** exists with usage instructions and examples. +7. Check for reasonable **error handling** (invalid refs, empty ranges, network errors). +8. 
Verify the solution uses a recent, up-to-date LLM from Anthropic or OpenAI. If an Agent SDK is used, \ +check which model it is configured with. Check the model identifier in the code against these references: + - Anthropic models: https://platform.claude.com/docs/en/about-claude/models/overview + - OpenAI models: https://platform.openai.com/docs/models""" + +RUBRIC_1_CODE_REVIEW = { + "git_operations": "str - (15 points) Does the solution use git operations (subprocess, gitpython, or similar) to retrieve commits between two references? Check for actual git commands or library usage in the code.", + "llm_categorization": "str - (15 points) Does the solution use an LLM to categorize and summarize commits into meaningful groups? It should not just dump raw commit messages. Agent SDKs with built-in LLM capabilities count.", + "cli_interface": "str - (10 points) Does the CLI accept a repository URL or local path, two commit references, and a configurable output path?", + "markdown_output": "str - (10 points) Does the code produce a markdown file with categorized sections and summarized descriptions?", + "readme_exists": "str - (10 points) Does a README.md exist with usage instructions and examples?", + "error_handling": "str - (5 points) Does the code handle edge cases like invalid refs, empty ranges, or network errors?", + "clean_architecture": "str - (10 points) Is the code well-structured with separation between git operations, LLM processing, and output formatting?", + "uses_recent_model": "str - (25 points) Does it use a recent model from Anthropic (see https://platform.claude.com/docs/en/about-claude/models/overview) or OpenAI (see https://platform.openai.com/docs/models)? If an Agent SDK is used, check which model it is configured with. 5 points partial credit if a model is used but it is not recent.", + "score": "float - Score between 0 and 100 based on the above criteria. 
Sum the points earned from each criterion.", +} + +# Define Semantic Test 2: Functional Test Against microsoft/playwright + +STEPS_2_FUNCTIONAL = """\ +1. Find the README.md file that explains how to use the tool and what commands to run. +2. Based on the README, install any required dependencies if needed. +3. Run the tool against the microsoft/playwright repository with the range v1.45.0 to v1.47.0. \ +The command should clone or reference the repo and generate a changelog between these two tags. + - If the tool fails or errors out, note an overall score of 0 for this test. + - This may take up to 20 minutes as it needs to clone the repo and process commits through an LLM. +4. After the tool completes, locate the generated markdown changelog file. +5. Read the official Playwright release notes that serve as ground truth for what actually changed: + - /project/test_time_data/playwright_v1.46.0_release_notes.md (changes from v1.45.0 to v1.46.0) + - /project/test_time_data/playwright_v1.47.0_release_notes.md (changes from v1.46.0 to v1.47.0) +6. Compare the generated changelog against the official release notes. Evaluate how well the \ +generated changelog captures the major features, changes, and fixes documented in the official \ +notes. The changelog does not need to match the official notes word-for-word, but the key themes \ +and changes should be represented. +7. Check for content that is fabricated or hallucinated: does the generated changelog contain \ +significant features or changes that are NOT in the official release notes or actual commits?""" + +RUBRIC_2_FUNCTIONAL = { + "tool_runs_successfully": "str - (20 points) Does the tool run without errors against microsoft/playwright v1.45.0..v1.47.0 and produce a markdown file?", + "coverage_of_official_changes": "str - (50 points) How well does the generated changelog capture the major features and changes from the official release notes? Full points if most key items are represented. 
Partial credit proportionally.", + "no_hallucinations": "str - (30 points) Is the changelog free of significant fabricated or hallucinated features that do not appear in the official release notes or actual commits?", + "score": "float - Score between 0 and 100 based on the above criteria. Sum the points earned from each criterion.", +} + + +@click.command() +@click.option( + "--test-id", + default=lambda: get_test_id_from_env_or_default("dev"), + help="Test ID for result file naming (defaults to EVAL_RECIPES_TEST_ID env var)", +) +@click.option( + "--output-dir", + type=click.Path(path_type=Path), + default=lambda: Path(__file__).parents[0], + help="Directory to write result file", +) +@click.option( + "--instructions-file", + type=click.Path(path_type=Path), + default=None, + help="Path to instructions file (defaults to ./instructions.txt in working directory)", +) +def main(test_id: str, output_dir: Path, instructions_file: Path | None) -> int: + """Test script for git_changelog_generator task.""" + return asyncio.run(run_test(test_id, output_dir, instructions_file)) + + +async def run_test(test_id: str, output_dir: Path, instructions_file: Path | None) -> int: + instructions = get_instructions_from_file_or_default(instructions_file=instructions_file) + + try: + logger.info("Running semantic test 1: Checking code architecture and implementation...") + result_1 = await semantic_test( + steps=STEPS_1_CODE_REVIEW, + rubric=RUBRIC_1_CODE_REVIEW, + context=instructions, + working_dir=Path("/project"), + ) + + logger.info("Running semantic test 2: Functional test against microsoft/playwright...") + result_2 = await semantic_test( + steps=STEPS_2_FUNCTIONAL, + rubric=RUBRIC_2_FUNCTIONAL, + context=instructions, + working_dir=Path("/project"), + ) + + final_score = result_1.score * 0.40 + result_2.score * 0.60 + + metadata = { + "instructions": instructions, + "semantic_test_1_code_review": { + "score": result_1.score, + "details": result_1.metadata, + }, + 
"semantic_test_2_functional": { + "score": result_2.score, + "details": result_2.metadata, + }, + "final_score": final_score, + "scoring_weights": { + "code_review": "40%", + "functional": "60%", + }, + } + + write_test_result(output_dir, test_id, final_score, metadata) + logger.info(f"Test completed with final score: {final_score:.1f}/100") + return 0 + + except Exception as e: + logger.error(f"Test failed with exception: {e}") + metadata = { + "instructions": instructions, + "error": str(e), + } + write_test_result(output_dir, test_id, 0, metadata) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/data/tasks/git_changelog_generator/test_time_data/playwright_v1.46.0_release_notes.md b/data/tasks/git_changelog_generator/test_time_data/playwright_v1.46.0_release_notes.md new file mode 100644 index 0000000..9f4e28e --- /dev/null +++ b/data/tasks/git_changelog_generator/test_time_data/playwright_v1.46.0_release_notes.md @@ -0,0 +1,118 @@ +## TLS Client Certificates + +Playwright now allows to supply client-side certificates, so that server can verify them, as specified by TLS Client Authentication. + +When client certificates are specified, all browser traffic is routed through a proxy that establishes the secure TLS connection, provides client certificates to the server and validates server certificates. + +The following snippet sets up a client certificate for `https://example.com`: + +```ts +import { defineConfig } from '@playwright/test'; + +export default defineConfig({ + // ... + use: { + clientCertificates: [{ + origin: 'https://example.com', + certPath: './cert.pem', + keyPath: './key.pem', + passphrase: 'mysecretpassword', + }], + }, + // ... 
+}); +``` + +You can also provide client certificates to a particular [test project](https://playwright.dev/docs/api/class-testproject#test-project-use) or as a parameter of [browser.newContext()](https://playwright.dev/docs/api/class-browser#browser-new-context) and [apiRequest.newContext()](https://playwright.dev/docs/api/class-apirequest#api-request-new-context). + +## `--only-changed` cli option + +New CLI option `--only-changed` allows to only run test files that have been changed since the last git commit or from a specific git "ref". + +```sh +# Only run test files with uncommitted changes +npx playwright test --only-changed + +# Only run test files changed relative to the "main" branch +npx playwright test --only-changed=main +``` + +## Component Testing: New `router` fixture + +This release introduces an experimental `router` fixture to intercept and handle network requests in component testing. +There are two ways to use the router fixture: +- Call `router.route(url, handler)` that behaves similarly to [page.route()](https://playwright.dev/docs/api/class-page#page-route). +- Call `router.use(handlers)` and pass [MSW library](https://mswjs.io) request handlers to it. + +Here is an example of reusing your existing MSW handlers in the test. + +```ts +import { handlers } from '@src/mocks/handlers'; + +test.beforeEach(async ({ router }) => { + // install common handlers before each test + await router.use(...handlers); +}); + +test('example test', async ({ mount }) => { + // test as usual, your handlers are active + // ... +}); +``` + +This fixture is only available in [component tests](https://playwright.dev/docs/test-components#handling-network-requests). + +## UI Mode / Trace Viewer Updates +- Test annotations are now shown in UI mode. +- Content of text attachments is now rendered inline in the attachments pane. +- New setting to show/hide routing actions like [route.continue()](https://playwright.dev/docs/api/class-route#route-continue). 
+- Request method and status are shown in the network details tab. +- New button to copy source file location to clipboard. +- Metadata pane now displays the `baseURL`. + +## Miscellaneous +- New `maxRetries` option in [apiRequestContext.fetch()](https://playwright.dev/docs/api/class-apirequestcontext#api-request-context-fetch) which retries on the `ECONNRESET` network error. +- New option to [box a fixture](https://playwright.dev/docs/test-fixtures#box-fixtures) to minimize the fixture exposure in test reports and error messages. +- New option to provide a [custom fixture title](https://playwright.dev/docs/test-fixtures#custom-fixture-title) to be used in test reports and error messages. + +## Possibly breaking change + +Fixture values that are array of objects, when specified in the `test.use()` block, may require being wrapped into a fixture tuple. This is best seen on the example: + +```ts +import { test as base } from '@playwright/test'; + +// Define an option fixture that has an "array of objects" value +type User = { name: string, password: string }; +const test = base.extend<{ users: User[] }>({ + users: [ [], { option: true } ], +}); + +// Specify option value in the test.use block. +test.use({ + // WRONG: this syntax may not work for you + users: [ + { name: 'John Doe', password: 'secret' }, + { name: 'John Smith', password: 's3cr3t' }, + ], + // CORRECT: this syntax will work. Note extra [] around the value, and the "scope" property. + users: [[ + { name: 'John Doe', password: 'secret' }, + { name: 'John Smith', password: 's3cr3t' }, + ], { scope: 'test' }], +}); + +test('example test', async () => { + // ... 
+}); +``` + +## Browser Versions +- Chromium 128.0.6613.18 +- Mozilla Firefox 128.0 +- WebKit 18.0 + +This version was also tested against the following stable channels: +- Google Chrome 127 +- Microsoft Edge 127 + diff --git a/data/tasks/git_changelog_generator/test_time_data/playwright_v1.47.0_release_notes.md b/data/tasks/git_changelog_generator/test_time_data/playwright_v1.47.0_release_notes.md new file mode 100644 index 0000000..8ef0405 --- /dev/null +++ b/data/tasks/git_changelog_generator/test_time_data/playwright_v1.47.0_release_notes.md @@ -0,0 +1,58 @@ +## Network Tab improvements + +The Network tab in the UI mode and trace viewer has several nice improvements: + +- filtering by asset type and URL +- better display of query string parameters +- preview of font assets + +![Network tab now has filters](https://github.com/user-attachments/assets/4bd1b67d-90bd-438b-a227-00b9e86872e2) + +Credit to @kubajanik for these wonderful improvements! + +## `--tsconfig` CLI option + +By default, Playwright will look up the closest tsconfig for each imported file using a heuristic. 
You can now specify a single tsconfig file in the command line, and Playwright will use it for all imported files, not only test files: + +```sh +# Pass a specific tsconfig +npx playwright test --tsconfig tsconfig.test.json +``` + +## [APIRequestContext](https://playwright.dev/docs/api/class-apirequestcontext) now accepts [`URLSearchParams`](https://developer.mozilla.org/en-US/docs/Web/API/URLSearchParams) and `string` as query parameters + +You can now pass [`URLSearchParams`](https://developer.mozilla.org/en-US/docs/Web/API/URLSearchParams) and `string` as query parameters to [APIRequestContext](https://playwright.dev/docs/api/class-apirequestcontext): + +```ts +test('query params', async ({ request }) => { + const searchParams = new URLSearchParams(); + searchParams.set('userId', 1); + const response = await request.get( + 'https://jsonplaceholder.typicode.com/posts', + { + params: searchParams // or as a string: 'userId=1' + } + ); + // ... +}); +``` + +## Miscellaneous +- The `mcr.microsoft.com/playwright:v1.47.0` now serves a Playwright image based on Ubuntu 24.04 Noble. + To use the 22.04 jammy-based image, please use `mcr.microsoft.com/playwright:v1.47.0-jammy` instead. +- The `:latest`/`:focal`/`:jammy` tag for Playwright Docker images is no longer being published. Pin to a specific version for better stability and reproducibility. +- New option `behavior` in [page.removeAllListeners()](https://playwright.dev/docs/api/class-page#page-remove-all-listeners), [browser.removeAllListeners()](https://playwright.dev/docs/api/class-browser#browser-remove-all-listeners) and [browserContext.removeAllListeners()](https://playwright.dev/docs/api/class-browsercontext#browser-context-remove-all-listeners) to wait for ongoing listeners to complete. +- TLS client certificates can now be passed from memory by passing `cert` and `key` as buffers instead of file paths. +- Attachments with a `text/html` content type can now be opened in a new tab in the HTML report. 
This is useful for including third-party reports or other HTML content in the Playwright test report and distributing it to your team. +- `noWaitAfter` in [locator.selectOption()](https://playwright.dev/docs/api/class-locator#locator-select-option) was deprecated. +- We've seen reports of WebGL in Webkit misbehaving on GitHub Actions `macos-13`. We recommend upgrading GitHub Actions to `macos-14`. + +## Browser Versions +- Chromium 129.0.6668.29 +- Mozilla Firefox 130.0 +- WebKit 18.0 + +This version was also tested against the following stable channels: +- Google Chrome 128 +- Microsoft Edge 128 + diff --git a/data/tasks/github_docs_extractor/task.yaml b/data/tasks/github_docs_extractor/task.yaml index 47a955f..c69ef93 100644 --- a/data/tasks/github_docs_extractor/task.yaml +++ b/data/tasks/github_docs_extractor/task.yaml @@ -4,13 +4,11 @@ task_info: non_deterministic_evals: true categories: - cli_tool - - document_processing - - data_extraction + - ai_tool evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | Build a tool that can extract documentation from any GitHub repository. 
diff --git a/data/tasks/image_tagging/task.yaml b/data/tasks/image_tagging/task.yaml index de79fde..8117fb3 100644 --- a/data/tasks/image_tagging/task.yaml +++ b/data/tasks/image_tagging/task.yaml @@ -4,7 +4,6 @@ task_info: non_deterministic_evals: true categories: - cli_tool - - automation - ai_tool evaluation_configs: - type: score diff --git a/data/tasks/ipo_tracker/task.yaml b/data/tasks/ipo_tracker/task.yaml new file mode 100644 index 0000000..b12c227 --- /dev/null +++ b/data/tasks/ipo_tracker/task.yaml @@ -0,0 +1,25 @@ +name: ipo_tracker +task_info: + difficulty: medium + non_deterministic_evals: true + categories: + - cli_tool + - knowledge_worker +evaluation_configs: + - type: score + test_script: test.py + test_command: uv run --no-project /project/test.py +instructions: | + I need a CLI tool that finds companies planning to IPO on US stock exchanges within the next 365 days from today. + + The output should be a CSV file with these columns: + - company_name: Name of the company planning to IPO + - expected_date: Expected IPO date or timeframe (e.g., "Q2 2025", "June 2025", "2025") + - exchange: Target exchange (NYSE or NASDAQ) if known, otherwise "Unknown" + - sector: Industry sector (e.g., "Technology", "Healthcare", "Finance") + - description: Brief one-line description of what the company does + - source_url: URL of the source article or filing + + Include a README.md with usage instructions. + You have an OPENAI_API_KEY and ANTHROPIC_API_KEY available for any LLM or image API calls you need to make. +timeout: 3600 diff --git a/data/tasks/ipo_tracker/test.py b/data/tasks/ipo_tracker/test.py new file mode 100644 index 0000000..356870c --- /dev/null +++ b/data/tasks/ipo_tracker/test.py @@ -0,0 +1,110 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +import asyncio +from pathlib import Path +import sys + +import click +from loguru import logger + +from eval_recipes.benchmarking.evaluation.semantic_test import semantic_test +from eval_recipes.benchmarking.evaluation.test_utils import ( + get_instructions_from_file_or_default, + get_test_id_from_env_or_default, + write_test_result, +) + +STEPS = """\ +1. Find the README.md file that explains how to use the tool and what commands to run. +2. Based on the README: + - Install any required dependencies if not already installed. + - Determine the correct command to run the tool with default settings. +3. Run the tool. This may take up to 15 minutes. + - If the tool fails or errors out, note down an overall score of 0 +4. After completion, locate the output CSV file and examine its structure: + - Does it have headers matching the required columns (company_name, expected_date, exchange, \ +sector, description, source_url)? + - Is the CSV properly formatted (valid CSV syntax, parseable without errors)? + - Does it contain at least 5 company entries? +5. Validate data quality in the CSV: + - Are company names real companies (not placeholder or dummy data)? + - Are expected dates reasonable (within the next 365 days from now)? + - Are sector fields meaningful (e.g., "Technology", "Healthcare", not empty or generic)? + - Are descriptions meaningful one-line summaries? + - Are source URLs present and not empty? +6. Cross-validate companies against web searches: + - Pick 3-5 companies from the CSV output. + - For each company, search the web to independently verify that the company is actually planning or has filed for an IPO. + - Check if the search results corroborate the IPO claim for each company. + - A company is verified if you can find at least one credible source confirming its IPO plans. +7. Test source URL accessibility: + - Pick 2-3 source_url values from the CSV. 
+ - Make HTTP requests to verify they are actually accessible (return 200/300-level responses).""" + +RUBRIC = { + "tool_runs_successfully": "str - (20 points) Does the tool run without errors?", + "csv_created_with_columns": "str - (10 points) Is a CSV file created with the required columns and proper formatting?", + "csv_has_enough_entries": "str - (10 points) Does the CSV contain at least 5 company entries?", + "companies_verified_via_search": "str - (30 points) When cross-validating 3-5 companies from the output by searching the web, can you find credible sources confirming their IPO plans? Award full points if most companies are verified, partial credit if some are verified, 0 if none can be verified or companies appear fabricated.", + "source_urls_accessible": "str - (10 points) When testing 2-3 source URLs with HTTP requests, are they accessible and relevant to the IPO claim?", + "data_fields_meaningful": "str - (20 points) Are company descriptions, sectors, and dates meaningful and specific (not empty or generic)?", + "score": "float - Score between 0 and 100 based on the above criteria. 
Sum the points earned from each criterion.", +} + + +@click.command() +@click.option( + "--test-id", + default=lambda: get_test_id_from_env_or_default("dev"), + help="Test ID for result file naming (defaults to EVAL_RECIPES_TEST_ID env var)", +) +@click.option( + "--output-dir", + type=click.Path(path_type=Path), + default=lambda: Path(__file__).parents[0], + help="Directory to write result file", +) +@click.option( + "--instructions-file", + type=click.Path(path_type=Path), + default=None, + help="Path to instructions file (defaults to ./instructions.txt in working directory)", +) +def main(test_id: str, output_dir: Path, instructions_file: Path | None) -> int: + """Test script for ipo_tracker task.""" + return asyncio.run(run_test(test_id, output_dir, instructions_file)) + + +async def run_test(test_id: str, output_dir: Path, instructions_file: Path | None) -> int: + instructions = get_instructions_from_file_or_default(instructions_file=instructions_file) + + try: + logger.info("Running semantic test: Running tool and validating output...") + result = await semantic_test( + steps=STEPS, + rubric=RUBRIC, + context=instructions, + working_dir=Path("/project"), + ) + + metadata = { + "instructions": instructions, + "semantic_test_result": result.metadata, + } + + write_test_result(output_dir, test_id, result.score, metadata) + logger.info(f"Test completed with final score: {result.score:.1f}/100") + return 0 + + except Exception as e: + logger.error(f"Test failed with exception: {e}") + metadata = { + "instructions": instructions, + "error": str(e), + } + write_test_result(output_dir, test_id, 0, metadata) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/data/tasks/linkedin_drafting/task.yaml b/data/tasks/linkedin_drafting/task.yaml index 4359509..998c79b 100644 --- a/data/tasks/linkedin_drafting/task.yaml +++ b/data/tasks/linkedin_drafting/task.yaml @@ -4,12 +4,11 @@ task_info: non_deterministic_evals: true categories: - cli_tool - - 
writing + - ai_tool evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | I need help creating a tool for drafting my LinkedIn posts. My writing process has several stages and I'd like to automate it. @@ -29,6 +28,7 @@ instructions: | I have a folder with my past LinkedIn posts that the tool should analyze to learn my style and tone. The tool should output the finished post formatted for LinkedIn, any generated images, and optionally a shortened social media version. Look at other tools for inspiration if needed. + You have an OPENAI_API_KEY and ANTHROPIC_API_KEY available for any LLM or image API calls you need to make. test_time_files: - source: test_time_data dest: /project diff --git a/data/tasks/linkedin_drafting/test.py b/data/tasks/linkedin_drafting/test.py index 0b7a0c2..bbed27d 100644 --- a/data/tasks/linkedin_drafting/test.py +++ b/data/tasks/linkedin_drafting/test.py @@ -14,44 +14,52 @@ write_test_result, ) -# Define Semantic Test 1: Dependencies and Architecture - -AGENT_SDK_DEFINITION = """The solution should use an Agent SDK, such as Claude Agent/Code SDK, Microsoft Agent Framework, Microsoft Amplifier (https://github.com/microsoft/amplifier/tree/next), OpenAI Codex CLI, or others that are similarly capable. These SDKs must have the following functionality: -- Automatic Context Management to ensure your agent doesn't run out of context. -- Rich tool ecosystem: File operations, code execution, web search, and MCP extensibility -- Excels at code generation and effectively gives the agent a "computer" where it can find appropriate files, write and edit files, lint the code, run it, debug, edit, and sometimes take these actions iteratively until it succeeds. 
-- APIs like OpenAI's chat completions or Responses API, Anthropic's Messages API, or Azure OpenAI alone are NOT sufficient and should not recieve any credit.""" - -STEPS_1_DEPENDENCIES = f"""{AGENT_SDK_DEFINITION} - -1. Explore the code that was created to understand the implementation. -2. Look for where dependencies are defined (e.g., pyproject.toml, requirements.txt, package.json, etc.) -3. Check if the solution uses an Agent SDK (see definition above): - - Check which dependencies are listed in dependency files - - Verify these dependencies are being imported in the code - - Confirm they are actually used in the implementation (not just imported) - - Verify the SDK provides the required agent capabilities, not just plain API calls -4. Check if the solution uses an image generation API: - - Look for OpenAI image generation API usage (DALL-E 3, gpt-image-1, or similar) - - Check if the code actually calls these APIs to generate images (not just create prompts) - - Verify imports and actual usage in the code (e.g., openai.images.generate or similar API calls) - - Confirm the images are actually being generated, not just prompts created -5. Look for evidence of separate stages or agents in the architecture: - - Check if there are separate components/functions/modules for key workflow parts: - * Image generation stage - * Research for references stage - * Review against existing style stage - - This could be implemented as: - * Multiple agents in an agentic loop - * Separate prompts for different stages - * Modular functions that handle distinct parts of the workflow - - The solution should NOT be just one monolithic prompt doing everything""" - -RUBRIC_1_DEPENDENCIES = { - "agent_sdk_identified": "str - Name of Agent SDK found, or 'None'", - "agent_sdk_usage": "str - (35 points) Does solution use qualifying Agent SDK (Claude Agent/Code SDK, Microsoft Agent Framework, Amplifier, OpenAI Codex CLI)? 
Must provide automatic context management, rich tool ecosystem, and iterative code capabilities. NOT plain API clients. Check dependency files, imports, and actual usage.", - "image_generation_api": "str - (35 points) Does the solution use OpenAI's image generation API (DALL-E 3, gpt-image-1, or similar)? Check dependencies and actual API calls in code. Verify images are actually generated, not just prompts.", - "separate_stages_or_agents": "str - (30 points) Does the solution have evidence of separate stages/agents for key workflow parts (image generation, research for references, review against existing style)? Could be agentic loops, separate prompts, or modular functions. Not just one monolithic prompt.", +# Define Semantic Test 1: Capabilities and Architecture + +AGENT_SDK_NOTE = """\ +NOTE: The solution may use either direct LLM API calls (OpenAI, Anthropic) or an Agent SDK such as \ +Claude Agent/Code SDK, Microsoft Amplifier (https://github.com/microsoft/amplifier/tree/next), OpenAI \ +Codex CLI, or others that are similarly capable. Both approaches are acceptable. If an Agent SDK is used, \ +its built-in LLM capabilities count as satisfying the LLM usage checks for each stage below.""" + +STEPS_1_CAPABILITIES = f"""\ +{AGENT_SDK_NOTE} + +1. Explore the code to understand the full implementation. +2. Check that it reads past posts from a directory to learn the user's writing style and tone. +3. Check that it accepts topic notes as input. +4. Check for a **topic/metaphor discovery** stage that uses an LLM to pick topics and find a metaphor or \ +story that makes technical concepts accessible. +5. Check for a **drafting** stage that uses an LLM to write creative paragraphs and draft explanations \ +for any jargon. +6. Check for an **image generation** stage that uses an image generation API (OpenAI DALL-E 3, \ +gpt-image-1, or similar) to actually produce images, not just create prompts. 
Verify imports and actual \ +API calls in the code (e.g., openai.images.generate or similar). +7. Check for a **research/references** stage that uses an LLM to find references and incorporate them \ +into the post. +8. Check for a **style matching** stage that uses an LLM to clean up the draft to match the user's \ +previous writing style from past posts. +9. Check for a **review/editing** stage that uses an LLM to check logic and reasoning and do a final review. +10. Check for a social media version output option (a shortened version of the post). +11. Check that the implementation has separate stages or components -- not one monolithic prompt doing \ +everything. This could be multiple agents, separate prompts, or modular functions. +12. Verify the solution uses a recent, up-to-date LLM from Anthropic or OpenAI. If an Agent SDK is used, \ +check which model it is configured with. Check the model identifier in the code against these references: + - Anthropic models: https://platform.claude.com/docs/en/about-claude/models/overview + - OpenAI models: https://platform.openai.com/docs/models""" + +RUBRIC_1_CAPABILITIES = { + "reads_past_posts": "str - (5 points) Does the code read past posts from a directory to learn style/tone?", + "accepts_topic_notes": "str - (5 points) Does the code accept topic notes as input?", + "topic_metaphor_discovery": "str - (10 points) Is there a stage that uses an LLM to pick topics and find metaphors/stories for technical concepts? Agent SDKs with built-in LLM capabilities count.", + "drafting_stage": "str - (10 points) Is there a stage that uses an LLM to write creative paragraphs and draft jargon explanations? Agent SDKs with built-in LLM capabilities count.", + "image_generation": "str - (15 points) Does the solution use an image generation API (DALL-E 3, gpt-image-1, or similar) to actually produce images? 
Verify actual API calls, not just prompt creation.", + "research_references": "str - (10 points) Is there a stage that uses an LLM to find references and incorporate them? Agent SDKs with built-in LLM capabilities count.", + "style_matching": "str - (5 points) Is there a stage that uses an LLM to match the user's past writing style? Agent SDKs with built-in LLM capabilities count.", + "review_editing": "str - (5 points) Is there a stage that uses an LLM to check logic/reasoning and do a final review? Agent SDKs with built-in LLM capabilities count.", + "social_media_option": "str - (5 points) Does the tool have an option to output a shortened social media version?", + "separate_stages": "str - (5 points) Does the implementation have separate stages/components (not one monolithic prompt)? Could be agentic loops, separate prompts, or modular functions.", + "uses_recent_model": "str - (25 points) Does it use a recent model from Anthropic (see https://platform.claude.com/docs/en/about-claude/models/overview) or OpenAI (see https://platform.openai.com/docs/models)? If an Agent SDK is used, check which model it is configured with. 5 points partial credit if a model is used but it is not recent.", "score": "float - Score between 0 and 100 based on the above criteria. 
Sum the points earned from each criterion.", } @@ -132,10 +140,10 @@ async def run_test(test_id: str, output_dir: Path, instructions_file: Path | Non instructions = get_instructions_from_file_or_default(instructions_file=instructions_file) try: - logger.info("Running semantic test 1: Checking dependencies and architecture...") + logger.info("Running semantic test 1: Checking capabilities and architecture...") result_1 = await semantic_test( - steps=STEPS_1_DEPENDENCIES, - rubric=RUBRIC_1_DEPENDENCIES, + steps=STEPS_1_CAPABILITIES, + rubric=RUBRIC_1_CAPABILITIES, context=instructions, working_dir=Path("/project"), ) @@ -154,7 +162,7 @@ async def run_test(test_id: str, output_dir: Path, instructions_file: Path | Non metadata = { "instructions": instructions, - "semantic_test_1_dependencies_and_architecture": { + "semantic_test_1_capabilities_and_architecture": { "score": result_1.score, "details": result_1.metadata, }, @@ -164,7 +172,7 @@ async def run_test(test_id: str, output_dir: Path, instructions_file: Path | Non }, "final_score": final_score, "scoring_weights": { - "dependencies_and_architecture": "30%", + "capabilities_and_architecture": "30%", "run_validate_quality": "70%", }, } diff --git a/data/tasks/markdown_deck_converter/task.yaml b/data/tasks/markdown_deck_converter/task.yaml index 9111a64..4f62d6a 100644 --- a/data/tasks/markdown_deck_converter/task.yaml +++ b/data/tasks/markdown_deck_converter/task.yaml @@ -4,13 +4,11 @@ task_info: non_deterministic_evals: true categories: - cli_tool - - document_processing - - automation + - knowledge_worker evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | I've been writing my presentation decks in markdown because it's so much faster than clicking around in PowerPoint, but I need to get them into PowerPoint format to share with my team and for final review. 
The problem is every markdown-to-slides tool I've tried creates presentations that are a nightmare to edit in PowerPoint - everything is locked down with absolute positioning and you can't move anything without breaking the whole layout. diff --git a/data/tasks/news_research_tool/task.yaml b/data/tasks/news_research_tool/task.yaml index fddd158..55f1ef2 100644 --- a/data/tasks/news_research_tool/task.yaml +++ b/data/tasks/news_research_tool/task.yaml @@ -4,15 +4,13 @@ task_info: non_deterministic_evals: true categories: - cli_tool - - research - - web_scraping + - ai_tool evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | - Build a tool that uses the ddgs (Dux Distributed Global Search) Python library to find and summarize the latest news on a given topic. + Build a tool that uses the ddgs (Dux Distributed Global Search, https://pypi.org/project/ddgs/) Python library to find and summarize the latest news on a given topic. The tool should: 1. Use ddgs to search for high quality recent news articles on the specified topic @@ -25,4 +23,5 @@ instructions: | - A summary of the latest developments on the topic with citations and references to source articles Make sure to include a README.md with usage instructions and examples. + You have an OPENAI_API_KEY and ANTHROPIC_API_KEY available for any LLM or image API calls you need to make. 
timeout: 4500 diff --git a/data/tasks/news_research_tool/test.py b/data/tasks/news_research_tool/test.py index 01efed3..fb5bfd9 100644 --- a/data/tasks/news_research_tool/test.py +++ b/data/tasks/news_research_tool/test.py @@ -14,104 +14,87 @@ write_test_result, ) -# Define Semantic Test 1: Check Dependencies - -AGENT_SDK_DEFINITION = """The solution should use an Agent SDK, such as Claude Agent/Code SDK, Microsoft Agent Framework, Microsoft Amplifier (https://github.com/microsoft/amplifier/tree/next), OpenAI Codex CLI, or others that are similarly capable. These SDKs must have the following functionality: -- Automatic Context Management to ensure your agent doesn't run out of context. -- Rich tool ecosystem: File operations, code execution, web search, and MCP extensibility -- Excels at code generation and effectively gives the agent a "computer" where it can find appropriate files, write and edit files, lint the code, run it, debug, edit, and sometimes take these actions iteratively until it succeeds. -- APIs like OpenAI's chat completions or Responses API, Anthropic's Messages API, or Azure OpenAI alone are NOT sufficient and should not recieve any credit.""" - -STEPS_1_DEPENDENCIES = f"""{AGENT_SDK_DEFINITION} - -1. Explore the code in the project directory to find where dependencies are defined. -2. Look for dependency files like pyproject.toml, requirements.txt, package.json, etc. -3. Check if the code uses: - - DDGS / Dux Distributed Global Search library for web search - - An Agent SDK (see definition above) for AI interactions -4. Verify these dependencies are actually imported and used in the implementation. -5. Confirm the AI framework is an Agent SDK with the required capabilities, not just a plain API client.""" - -RUBRIC_1_DEPENDENCIES = { - "duckduckgo_dependency": "str - (50 points) Does the solution use ddgs Python library for searching news? 
Check both dependency files and actual imports/usage in code.", - "agent_sdk_identified": "str - Name of Agent SDK found, or 'None'", - "agent_sdk_dependency": "str - (50 points) Does the solution use a qualifying Agent SDK (Claude Agent/Code SDK, Microsoft Agent Framework, Amplifier, OpenAI Codex CLI) for AI interactions? Must provide automatic context management, rich tool ecosystem, and iterative code capabilities. NOT plain API clients.", +# Define Semantic Test 1: Capabilities and Dependencies + +AGENT_SDK_NOTE = """\ +NOTE: The solution may use either direct LLM API calls (OpenAI, Anthropic) or an Agent SDK such as \ +Claude Agent/Code SDK, Microsoft Amplifier (https://github.com/microsoft/amplifier/tree/next), OpenAI \ +Codex CLI, or others that are similarly capable. Both approaches are acceptable. If an Agent SDK is used, \ +its built-in LLM capabilities count as satisfying the LLM usage checks for each stage below.""" + +STEPS_1_CAPABILITIES = f"""\ +{AGENT_SDK_NOTE} + +1. Explore the code in the project directory to understand the full implementation. +2. Look for where dependencies are defined (e.g., pyproject.toml, requirements.txt, package.json, etc.) +3. Check that the solution uses the **ddgs** (Dux Distributed Global Search, \ +https://pypi.org/project/ddgs/) Python library for web search. Verify it is listed in dependency files \ +and actually imported and used in the code. +4. Check that the solution uses an **LLM to synthesize** the search results into a digestible report \ +(not just raw search output). This could be via direct API calls or an Agent SDK. +5. Check for a **supporting information** stage: after the initial ddgs search returns links, the tool \ +should find additional supporting information (e.g., fetching article content, follow-up searches). +6. Check that the output is saved to a **markdown file** with proper formatting. +7. Check that the CLI accepts a **topic as input** (command-line argument or option). +8. 
Check that a **README.md** exists with usage instructions and examples. +9. Verify the solution uses a recent, up-to-date LLM from Anthropic or OpenAI. If an Agent SDK is used, \ +check which model it is configured with. Check the model identifier in the code against these references: + - Anthropic models: https://platform.claude.com/docs/en/about-claude/models/overview + - OpenAI models: https://platform.openai.com/docs/models""" + +RUBRIC_1_CAPABILITIES = { + "ddgs_dependency": "str - (25 points) Does the solution use the ddgs Python library for searching news? Check both dependency files and actual imports/usage in code.", + "llm_synthesis": "str - (15 points) Does the solution use an LLM to synthesize search results into a digestible report (not just raw output)? Agent SDKs with built-in LLM capabilities count.", + "supporting_information": "str - (10 points) After initial search, does the tool find supporting information (fetching article content, follow-up searches)?", + "markdown_output": "str - (5 points) Does the tool save output to a properly formatted markdown file?", + "cli_accepts_topic": "str - (5 points) Does the CLI accept a topic as input?", + "readme_exists": "str - (5 points) Does a README.md exist with usage instructions and examples?", + "grounded_citations": "str - (10 points) Does the implementation ensure content is grounded with proper citations including URLs and publication dates?", + "uses_recent_model": "str - (25 points) Does it use a recent model from Anthropic (see https://platform.claude.com/docs/en/about-claude/models/overview) or OpenAI (see https://platform.openai.com/docs/models)? If an Agent SDK is used, check which model it is configured with. 5 points partial credit if a model is used but it is not recent.", "score": "float - Score between 0 and 100 based on the above criteria. 
Sum the points earned from each criterion.", } -# Define Semantic Test 2: Run Tool and Validate Output Structure +# Define Semantic Test 2: Run Tool and Validate Output -STEPS_2_RUN_AND_VALIDATE = """1. Find the README.md file that explains how to use the tool and what commands to run. -2. Based on the README, - - Install any required dependencies if not already installed. - - Determine the correct command to run the tool with a simple test topic like "Python programming news". -3. Run the tool. This may take up to 15 minutes as it needs to search the web and process results. -4. Check if the tool runs successfully without errors. - - If the tool fails, note down an overall score of 0! -5. Verify the CLI interface: - - Accepts a topic as input (command-line argument or interactive prompt) - - Provides clear output or progress messages - - Completes successfully -6. Find the output file(s): - - Should be a markdown file (.md extension) - - Should contain the research results -7. Examine the markdown file structure: - - Is there a summary section with findings? - - Are there citations/references? - - Are URLs included in the citations? 
- - Are any dates included (publication dates of articles)?""" - -RUBRIC_2_RUN_AND_VALIDATE = { - "readme_exists": "str - (5 points) Does a README.md exist with clear usage and installation instructions?", - "tool_runs_successfully": "str - (20 points) Does the tool run without errors when given a topic?", - "cli_accepts_topic": "str - (10 points) Does the CLI properly accept a topic as input?", - "markdown_output_created": "str - (15 points) Is a markdown file created with the results?", - "has_summary": "str - (15 points) Does the markdown file contain a summary of findings?", - "has_citations": "str - (15 points) Are there citations/references to source articles?", - "citations_have_urls": "str - (10 points) Do citations include URLs to the source articles?", - "citations_have_dates": "str - (10 points) Do citations include publication dates?", - "score": "float - Score between 0 and 100 based on the above criteria. Sum the points earned from each criterion.", -} - - -# Define Semantic Test 3: Run with Specific Topic and Validate Quality - -STEPS_3_QUALITY_VALIDATION = """1. Find the README.md file that explains how to use the tool and what commands to run. -2. Based on the README, - - Install any required dependencies if not already installed. - - Determine the correct command to run the tool with a simple test topic like "Python programming news".1. Based on the README, determine the command to run the tool with this specific topic: \ +STEPS_2_RUN_AND_VALIDATE = """\ +1. Find the README.md file that explains how to use the tool and what commands to run. +2. Based on the README: + - Install any required dependencies if not already installed. + - Determine the correct command to run the tool with this topic: \ "artificial intelligence breakthroughs 2025" -3. Run the tool with this topic. This may take several minutes. - - If the tool fails, note down an overall score of 0! -4. After completion, find and read the generated markdown file. -5. 
Evaluate the search quality: - - Are the articles actually relevant to AI breakthroughs? - - Did it pick high quality sources? +3. Run the tool. This may take up to 15 minutes as it needs to search the web and process results. + - If the tool fails or errors out, note down an overall score of 0! +4. After completion, find the output file(s): + - Should be a markdown file (.md extension) containing the research results +5. Evaluate the output structure: + - Does the markdown file contain a summary that synthesizes findings? + - Are there citations/references to source articles? + - Do citations include URLs to the source articles? + - Do citations include publication dates? +6. Evaluate the output quality: + - Are the found articles actually relevant to AI breakthroughs? - Are the articles recent (2024-2025)? -6. Evaluate the AI analysis quality: - - Does it synthesize information from multiple sources? -7. Evaluate citation quality: + - Does the summary synthesize information from multiple sources (not just list them)? - Are there at least 3-5 different source articles cited? - - Does each citation include the article URL? - - Are there publication dates included in the generated synthesis? - - Are the citations properly formatted and easy to verify? -8. Evaluate content grounding: +7. Evaluate content grounding: - Is the summary based on actual information from the cited sources? - Are there any unsupported claims or hallucinations? 
- Does it properly attribute information to specific sources?""" -RUBRIC_3_QUALITY_VALIDATION = { - "tool_runs_successfully": "str - (10 points) Does the tool run successfully with the test topic?", - "articles_are_relevant": "str - (20 points) Are the found articles actually relevant to the topic?", - "articles_are_recent": "str - (20 points) Are the articles recent (2024-2025)?", +RUBRIC_2_RUN_AND_VALIDATE = { + "readme_exists": "str - (5 points) Does a README.md exist with clear usage and installation instructions?", + "tool_runs_successfully": "str - (20 points) Does the tool run without errors when given the topic?", + "markdown_output_created": "str - (10 points) Is a markdown file created with the results?", + "has_summary": "str - (10 points) Does the markdown file contain a summary of findings?", + "articles_are_relevant": "str - (10 points) Are the found articles actually relevant to the topic?", + "articles_are_recent": "str - (10 points) Are the articles recent (2024-2025)?", "synthesizes_multiple_sources": "str - (10 points) Does it synthesize information from multiple sources?", - "has_multiple_citations": "str - (10 points) Are there at least 3-5 different source articles cited?", - "citations_include_urls": "str - (5 points) Do all citations include URLs?", - "citations_include_dates": "str - (5 points) Do all citations include publication dates?", - "includes_dates": "str - (5 points) Are publication dates included in the summary or citations?", + "has_multiple_citations": "str - (5 points) Are there at least 3-5 different source articles cited?", + "citations_have_urls": "str - (5 points) Do citations include URLs to the source articles?", + "citations_have_dates": "str - (5 points) Do citations include publication dates?", "content_is_grounded": "str - (5 points) Is the content grounded in actual source material without hallucinations?", - "proper_attribution": "str - (10 points) Is information properly attributed to specific sources?", + 
"proper_attribution": "str - (5 points) Is information properly attributed to specific sources?", "score": "float - Score between 0 and 100 based on the above criteria. Sum the points earned from each criterion.", } @@ -143,15 +126,15 @@ async def run_test(test_id: str, output_dir: Path, instructions_file: Path | Non instructions = get_instructions_from_file_or_default(instructions_file=instructions_file) try: - logger.info("Running semantic test 1: Checking for correct dependencies...") + logger.info("Running semantic test 1: Checking capabilities and dependencies...") result_1 = await semantic_test( - steps=STEPS_1_DEPENDENCIES, - rubric=RUBRIC_1_DEPENDENCIES, + steps=STEPS_1_CAPABILITIES, + rubric=RUBRIC_1_CAPABILITIES, context=instructions, working_dir=Path("/project"), ) - logger.info("Running semantic test 2: Running tool and validating output structure...") + logger.info("Running semantic test 2: Running tool and validating output...") result_2 = await semantic_test( steps=STEPS_2_RUN_AND_VALIDATE, rubric=RUBRIC_2_RUN_AND_VALIDATE, @@ -159,21 +142,13 @@ async def run_test(test_id: str, output_dir: Path, instructions_file: Path | Non working_dir=Path("/project"), ) - logger.info("Running semantic test 3: Running tool with specific topic and validating quality...") - result_3 = await semantic_test( - steps=STEPS_3_QUALITY_VALIDATION, - rubric=RUBRIC_3_QUALITY_VALIDATION, - context=instructions, - working_dir=Path("/project"), - ) - # Calculate final score with weighted average - # Weights: dependencies (20%), run and validate (30%), quality validation (50%) - final_score = result_1.score * 0.20 + result_2.score * 0.30 + result_3.score * 0.50 + # Weights: capabilities (30%), run and validate (70%) + final_score = result_1.score * 0.30 + result_2.score * 0.70 metadata = { "instructions": instructions, - "semantic_test_1_dependencies": { + "semantic_test_1_capabilities": { "score": result_1.score, "details": result_1.metadata, }, @@ -181,15 +156,10 @@ async def 
run_test(test_id: str, output_dir: Path, instructions_file: Path | Non "score": result_2.score, "details": result_2.metadata, }, - "semantic_test_3_quality_validation": { - "score": result_3.score, - "details": result_3.metadata, - }, "final_score": final_score, "scoring_weights": { - "dependencies": "20%", - "run_and_validate": "30%", - "quality_validation": "50%", + "capabilities": "30%", + "run_and_validate": "70%", }, } diff --git a/data/tasks/pdf-hr-q1/task.yaml b/data/tasks/pdf-hr-q1/task.yaml index 2f6b1f6..649f097 100644 --- a/data/tasks/pdf-hr-q1/task.yaml +++ b/data/tasks/pdf-hr-q1/task.yaml @@ -8,7 +8,6 @@ evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | I am doing some data analysis based on `hr_report.pdf`. Can you help me answer this: What is the average base salary for the Consumer Home & Office Products division at the Mexico City Manufacturing plant in MX$? Please place the answer in a text file called "data_analysis_answer.txt" that just contains the answer. diff --git a/data/tasks/pdf-hr-q2/task.yaml b/data/tasks/pdf-hr-q2/task.yaml index 0a11aa5..c4ddfd2 100644 --- a/data/tasks/pdf-hr-q2/task.yaml +++ b/data/tasks/pdf-hr-q2/task.yaml @@ -8,7 +8,6 @@ evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | I am doing some data analysis based on `hr_report.pdf`. Can you help me answer this: How many employees have a level of M1 (manager) at the entire company? Please place the answer in a text file called "data_analysis_answer.txt" that just contains the answer. 
diff --git a/data/tasks/pdf-hr-q3/task.yaml b/data/tasks/pdf-hr-q3/task.yaml index 011df46..7aa787d 100644 --- a/data/tasks/pdf-hr-q3/task.yaml +++ b/data/tasks/pdf-hr-q3/task.yaml @@ -8,7 +8,6 @@ evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | I am doing some data analysis based on `hr_report.pdf`. Can you help me answer this: What is the median base salary for employees paid in USD at level L4 in the AMER region? Please place the answer in a text file called "data_analysis_answer.txt" that just contains the answer. diff --git a/data/tasks/pdf-hr-q4/task.yaml b/data/tasks/pdf-hr-q4/task.yaml index 4f47042..0b3c9e9 100644 --- a/data/tasks/pdf-hr-q4/task.yaml +++ b/data/tasks/pdf-hr-q4/task.yaml @@ -8,7 +8,6 @@ evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | I am doing some data analysis based on `hr_report.pdf`. Can you help me answer this: For all employees paid in USD at the St. Paul Headquarters, what is the median base salary for each performance rating? Please place the answer in a text file called "data_analysis_answer.txt" that just contains the answer. diff --git a/data/tasks/pdf-hr-q5/task.yaml b/data/tasks/pdf-hr-q5/task.yaml index 8f08a96..0ccdbed 100644 --- a/data/tasks/pdf-hr-q5/task.yaml +++ b/data/tasks/pdf-hr-q5/task.yaml @@ -8,7 +8,6 @@ evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | I am doing some data analysis based on `hr_report.pdf`. Can you help me answer this: Which division had the largest increase in the percentage of employees rated 4 or 5 between 2021 and 2024? Include the division name and the percent change. Please place the answer in a text file called "data_analysis_answer.txt" that just contains the answer. 
diff --git a/data/tasks/pixel_art_generator/task.yaml b/data/tasks/pixel_art_generator/task.yaml new file mode 100644 index 0000000..f9fcae9 --- /dev/null +++ b/data/tasks/pixel_art_generator/task.yaml @@ -0,0 +1,33 @@ +name: pixel_art_generator +task_info: + difficulty: hard + non_deterministic_evals: true + categories: + - cli_tool + - ai_tool +evaluation_configs: + - type: score + test_script: test.py + test_command: uv run --no-project /project/test.py +instructions: | + I want a CLI tool called "pixelart" that generates game-ready pixel art sprites. + Place the code under scenarios/pixel_art_generator. + + It should use an OpenAI image model to generate the initial image, then post-process + it down to actual pixel art: the right dimensions, limited color palette, transparent + background, no blurring. The output should be a PNG I can drop into a game engine. + + I need three commands: + - `pixelart generate "a knight with a sword" --size 32 --output knight.png` to generate + a single sprite. Size should be configurable (8, 16, 32, 64) and default to 32. I also + want a --palette-size option to control the number of colors (default 16). + - `pixelart inspect sprite.png` to show me the properties of a sprite (dimensions, colors, + transparency, etc.) + - `pixelart batch descriptions.json --size 32 --output-dir sprites/` to generate multiple + sprites from a JSON file where each entry has a "name" and "description". + + Include a README with usage examples. You have an OPENAI_API_KEY and ANTHROPIC_API_KEY available. +test_time_files: + - source: test_time_data + dest: /project +timeout: 3600 diff --git a/data/tasks/pixel_art_generator/test.py b/data/tasks/pixel_art_generator/test.py new file mode 100644 index 0000000..8d5f319 --- /dev/null +++ b/data/tasks/pixel_art_generator/test.py @@ -0,0 +1,207 @@ +# Copyright (c) Microsoft. All rights reserved. 
+ +import asyncio +from pathlib import Path +import sys + +import click +from loguru import logger + +from eval_recipes.benchmarking.evaluation.semantic_test import semantic_test +from eval_recipes.benchmarking.evaluation.test_utils import ( + get_instructions_from_file_or_default, + get_test_id_from_env_or_default, + write_test_result, +) + +AGENT_SDK_NOTE = """\ +NOTE: The solution may use either direct OpenAI API calls or an Agent SDK such as \ +Claude Agent/Code SDK, Microsoft Amplifier (https://github.com/microsoft/amplifier/tree/next), OpenAI \ +Codex CLI, or others that are similarly capable. Both approaches are acceptable for the image \ +generation API calls.""" + +IMAGE_GEN_DOCS_URL = "https://platform.openai.com/docs/guides/image-generation" + +# region Semantic Test 1: Architecture and Code Quality + +STEPS_1_ARCHITECTURE = f"""\ +{AGENT_SDK_NOTE} + +1. First, look up the latest OpenAI image generation model by fetching {IMAGE_GEN_DOCS_URL}. \ +Note the recommended model name for image generation. +2. Explore the code under scenarios/pixel_art_generator/ to understand the implementation. +3. Check that the code uses an OpenAI image generation model. Compare the model used against \ +what you found in the docs -- is it the latest recommended model? +4. Check that the code requests a transparent background from the API so sprites can be used \ +in games. +5. Check that the post-processing pipeline produces actual pixel art: + - Resizing must avoid blurring (nearest-neighbor interpolation, not bilinear/bicubic/lanczos) + - Colors are reduced to a limited palette + - Output is a PNG with an alpha channel (transparency) +6. Verify the three CLI commands exist: generate, inspect, and batch, with the options \ +described in the instructions (--size, --palette-size, --output, --output-dir). +7. 
Check for a README with usage examples.""" + +RUBRIC_1_ARCHITECTURE = { + "uses_latest_model": ( + "str - (20 points) Does the code use the latest OpenAI image generation model " + "from the docs? State which model the docs recommend and which model the code uses. " + "Full points if it matches, 10 points partial credit if it uses an older OpenAI image model." + ), + "transparent_background": ( + "str - (15 points) Does the code request a transparent background from the API so sprites are game-ready?" + ), + "no_blurry_resize": ( + "str - (15 points) Does resizing use nearest-neighbor interpolation? " + "Any smoothing/blurring interpolation method is wrong for pixel art." + ), + "color_reduction": "str - (15 points) Does the code reduce colors to a limited palette?", + "png_with_alpha": "str - (10 points) Does the code output PNGs with an alpha channel for transparency?", + "cli_commands": "str - (10 points) Are the three commands (generate, inspect, batch) present with the expected options?", + "readme_exists": "str - (5 points) Does a README exist with usage examples?", + "code_quality": "str - (10 points) Is the code well-organized with reasonable error handling?", + "score": "float - Score between 0 and 100 based on the above criteria. Sum the points earned from each criterion.", +} + +# endregion + +# region Semantic Test 2: Functional Run + +STEPS_2_FUNCTIONAL = """\ +This test validates that the tool actually works end-to-end. A validation script is \ +available at /project/test_time_data/validate_sprites.py that checks sprite properties. + +1. Read the README under scenarios/pixel_art_generator/ to learn how to run the tool. +2. Install any required dependencies. +3. Create the /project/test_outputs/ directory. + +4. 
Generate three sprites at different sizes: + pixelart generate "a knight with a sword and shield" --size 32 --palette-size 16 \ +--output /project/test_outputs/knight.png + pixelart generate "a health potion bottle" --size 64 --palette-size 8 \ +--output /project/test_outputs/potion.png + pixelart generate "a treasure chest with gold coins" --size 16 --palette-size 12 \ +--output /project/test_outputs/chest.png + +5. Run the inspect command on one of the sprites and check the output looks reasonable. + +6. Test batch mode. Create /project/test_outputs/batch_input.json with: + [ + {"name": "tree", "description": "a pixel art oak tree"}, + {"name": "gem", "description": "a blue diamond gemstone"} + ] + Then run: pixelart batch /project/test_outputs/batch_input.json --size 32 \ +--output-dir /project/test_outputs/batch/ + +7. Run the validation script on the generated sprites: + uv run --with Pillow /project/test_time_data/validate_sprites.py \ +/project/test_outputs/ --expected-size 32 --expected-palette 16 + Note: this will validate all PNGs recursively. The --expected-size and \ +--expected-palette flags are global hints -- some sprites have different sizes, \ +so focus on whether each sprite matches what was requested for it. + +8. Read the validation output carefully. Check if sprites pass: dimensions, \ +background removal (transparency), color palette limits, and anti-aliasing checks. + +9. 
Visually assess: do the sprites look like pixel art you could use in a game?""" + +RUBRIC_2_FUNCTIONAL = { + "generate_works": "str - (15 points) Does the generate command run and produce PNG files?", + "inspect_works": "str - (5 points) Does the inspect command show useful sprite info?", + "correct_dimensions": "str - (15 points) Do sprites match their requested --size (32x32, 64x64, 16x16)?", + "correct_palette": "str - (10 points) Are color counts within the requested --palette-size limits?", + "has_transparency": "str - (10 points) Do sprites have transparent backgrounds (not white or solid)?", + "no_antialiasing": "str - (5 points) Do sprites pass the anti-aliasing check from the validation script?", + "batch_works": "str - (10 points) Does batch mode produce the expected sprite files?", + "looks_like_pixel_art": ( + "str - (15 points) Do the sprites look like actual pixel art? Blocky, limited colors, no smooth gradients." + ), + "game_ready": ( + "str - (15 points) Could these sprites be dropped into a game? " + "Transparent background, clean edges, recognizable subjects." + ), + "score": "float - Score between 0 and 100 based on the above criteria. 
Sum the points earned from each criterion.", +} + +# endregion + + +@click.command() +@click.option( + "--test-id", + default=lambda: get_test_id_from_env_or_default("dev"), + help="Test ID for result file naming (defaults to EVAL_RECIPES_TEST_ID env var)", +) +@click.option( + "--output-dir", + type=click.Path(path_type=Path), + default=lambda: Path(__file__).parents[0], + help="Directory to write result file", +) +@click.option( + "--instructions-file", + type=click.Path(path_type=Path), + default=None, + help="Path to instructions file (defaults to ./instructions.txt in working directory)", +) +def main(test_id: str, output_dir: Path, instructions_file: Path | None) -> int: + """Test script for pixel_art_generator task.""" + return asyncio.run(run_test(test_id, output_dir, instructions_file)) + + +async def run_test(test_id: str, output_dir: Path, instructions_file: Path | None) -> int: + instructions = get_instructions_from_file_or_default(instructions_file=instructions_file) + + try: + logger.info("Running semantic test 1: Checking architecture and implementation...") + result_1 = await semantic_test( + steps=STEPS_1_ARCHITECTURE, + rubric=RUBRIC_1_ARCHITECTURE, + context=instructions, + working_dir=Path("/project"), + ) + + logger.info("Running semantic test 2: Functional testing (generation, inspection, batch)...") + result_2 = await semantic_test( + steps=STEPS_2_FUNCTIONAL, + rubric=RUBRIC_2_FUNCTIONAL, + context=instructions, + working_dir=Path("/project"), + ) + + # Weights: architecture (30%), functional (70%) + final_score = result_1.score * 0.30 + result_2.score * 0.70 + + metadata = { + "instructions": instructions, + "semantic_test_1_architecture": { + "score": result_1.score, + "details": result_1.metadata, + }, + "semantic_test_2_functional": { + "score": result_2.score, + "details": result_2.metadata, + }, + "final_score": final_score, + "scoring_weights": { + "architecture": "30%", + "functional": "70%", + }, + } + + 
write_test_result(output_dir, test_id, final_score, metadata) + logger.info(f"Test completed with final score: {final_score:.1f}/100") + return 0 + + except Exception as e: + logger.error(f"Test failed with exception: {e}") + metadata = { + "instructions": instructions, + "error": str(e), + } + write_test_result(output_dir, test_id, 0, metadata) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/data/tasks/pixel_art_generator/test_time_data/validate_sprites.py b/data/tasks/pixel_art_generator/test_time_data/validate_sprites.py new file mode 100644 index 0000000..4899817 --- /dev/null +++ b/data/tasks/pixel_art_generator/test_time_data/validate_sprites.py @@ -0,0 +1,112 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Validate pixel art sprites for game readiness. + +Checks each PNG in a directory for: correct dimensions, alpha channel, +transparent background, limited color palette, and anti-aliasing artifacts. +""" + +import json +from pathlib import Path +import sys + +import click +from PIL import Image + + +def validate_sprite(path: Path, expected_size: int | None, expected_palette: int | None) -> dict: + """Validate a single sprite and return a results dict.""" + result: dict = { + "file": path.name, + "width": 0, + "height": 0, + "mode": "", + "unique_colors": 0, + "transparency_pct": 0.0, + "checks": {}, + } + + img = Image.open(path) + result["width"] = img.width + result["height"] = img.height + result["mode"] = img.mode + + if img.mode != "RGBA": + img = img.convert("RGBA") + + # Dimensions + if expected_size is not None: + result["checks"]["dimensions"] = ( + "PASS" if img.width == expected_size and img.height == expected_size else "FAIL" + ) + result["checks"]["dimensions_detail"] = ( + f"expected {expected_size}x{expected_size}, got {img.width}x{img.height}" + ) + + # Alpha channel + result["checks"]["has_alpha"] = "PASS" if "A" in img.getbands() else "FAIL" + + # Transparency percentage + pixels = 
list(img.getdata()) + total = len(pixels) + transparent = sum(1 for p in pixels if p[3] == 0) + result["transparency_pct"] = round(transparent / total * 100.0, 1) if total > 0 else 0.0 + result["checks"]["background_removed"] = "PASS" if result["transparency_pct"] > 10.0 else "FAIL" + result["checks"]["background_detail"] = f"{result['transparency_pct']}% transparent pixels" + + # Color count + colors = img.getcolors(maxcolors=total) + num_colors = len(colors) if colors else -1 + result["unique_colors"] = num_colors + if expected_palette is not None: + result["checks"]["palette"] = "PASS" if 0 < num_colors <= expected_palette else "FAIL" + result["checks"]["palette_detail"] = f"expected <={expected_palette} colors, got {num_colors}" + + # Anti-aliasing check: in pixel art, very few colors should appear only once or twice + if num_colors > 0 and colors: + color_counts = {tuple(c[1]): c[0] for c in colors} + rare = sum(1 for count in color_counts.values() if count < 3) + ratio = round(rare / len(color_counts) * 100.0, 1) + result["checks"]["no_antialiasing"] = "PASS" if ratio <= 30.0 else "FAIL" + result["checks"]["no_antialiasing_detail"] = f"{ratio}% of colors appear fewer than 3 times" + + return result + + +@click.command() +@click.argument("directory", type=click.Path(exists=True, path_type=Path)) +@click.option("--expected-size", type=int, default=None, help="Expected width and height in pixels") +@click.option("--expected-palette", type=int, default=None, help="Maximum number of unique colors") +def main(directory: Path, expected_size: int | None, expected_palette: int | None) -> None: + """Validate all PNG sprites in DIRECTORY.""" + png_files = sorted(directory.glob("**/*.png")) + if not png_files: + click.echo(f"No PNG files found in {directory}") + sys.exit(1) + + results = [] + for png in png_files: + try: + results.append(validate_sprite(png, expected_size, expected_palette)) + except Exception as e: + results.append({"file": png.name, "error": 
str(e)}) + + # Print summary + click.echo(json.dumps(results, indent=2)) + + # Print pass/fail overview + click.echo("\n--- Summary ---") + for r in results: + if "error" in r: + click.echo(f" {r['file']}: ERROR - {r['error']}") + continue + checks = r.get("checks", {}) + failures = [k for k, v in checks.items() if v == "FAIL" and not k.endswith("_detail")] + if failures: + click.echo(f" {r['file']}: FAIL ({', '.join(failures)})") + else: + click.echo(f" {r['file']}: PASS") + + +if __name__ == "__main__": + main() diff --git a/data/tasks/ppt-1/task.yaml b/data/tasks/ppt-1/task.yaml index 0201354..6257d2f 100644 --- a/data/tasks/ppt-1/task.yaml +++ b/data/tasks/ppt-1/task.yaml @@ -8,7 +8,6 @@ task_info: evaluation_configs: - type: comparison guidelines: null -dockerfile_portion: "" instructions: | First, extract the ppt-1.zip file: unzip ppt-1.zip diff --git a/data/tasks/ppt-2/task.yaml b/data/tasks/ppt-2/task.yaml index fb95fe9..6d493fc 100644 --- a/data/tasks/ppt-2/task.yaml +++ b/data/tasks/ppt-2/task.yaml @@ -9,7 +9,6 @@ task_info: evaluation_configs: - type: comparison guidelines: null -dockerfile_portion: "" instructions: | First, extract the ppt-2.zip file: unzip ppt-2.zip diff --git a/data/tasks/ppt-3/task.yaml b/data/tasks/ppt-3/task.yaml index e48489f..03f41f0 100644 --- a/data/tasks/ppt-3/task.yaml +++ b/data/tasks/ppt-3/task.yaml @@ -9,7 +9,6 @@ task_info: evaluation_configs: - type: comparison guidelines: null -dockerfile_portion: "" instructions: | First, extract the ppt-3.zip file: unzip ppt-3.zip diff --git a/data/tasks/product_review_finder/task.yaml b/data/tasks/product_review_finder/task.yaml index c2d33b3..779bacb 100644 --- a/data/tasks/product_review_finder/task.yaml +++ b/data/tasks/product_review_finder/task.yaml @@ -4,13 +4,13 @@ task_info: non_deterministic_evals: true categories: - cli_tool - - research - - web_scraping + - ai_tool + - knowledge_worker + - web evaluation_configs: - type: score test_script: test.py test_command: uv run 
--no-project /project/test.py -dockerfile_portion: "" instructions: | I need a CLI tool that finds professional product reviews from reputable sources. Put all the code under scenarios/product_review_finder. @@ -18,7 +18,7 @@ instructions: | Here's how it should work: - First, use ddgs (Dux Distributed Global Search) to find reviews from reputable tech sites, consumer reports, and trusted review publications. Analyze the search results and extract detailed content from the best sources. + First, use ddgs (Dux Distributed Global Search, https://pypi.org/project/ddgs/) to find reviews from reputable tech sites, consumer reports, and trusted review publications. Analyze the search results and extract detailed content from the best sources. From there, have a writer component that creates a first draft of the markdown report including overall consensus, key strengths/weaknesses, ratings, quotes, citations with URLs and dates, and a final recommendation. @@ -35,4 +35,5 @@ instructions: | Save the final output to a timestamped markdown file named after the product. Include a README with usage examples. + You have an OPENAI_API_KEY and ANTHROPIC_API_KEY available for any LLM or image API calls you need to make. timeout: 4500 diff --git a/data/tasks/product_review_finder/test.py b/data/tasks/product_review_finder/test.py index f94c7e8..a8bc4f3 100644 --- a/data/tasks/product_review_finder/test.py +++ b/data/tasks/product_review_finder/test.py @@ -14,45 +14,53 @@ write_test_result, ) -# Define Semantic Test 1: Check Multi-Stage Architecture - -AGENT_SDK_DEFINITION = """The solution should use an Agent SDK, such as Claude Agent/Code SDK, Microsoft Agent Framework, Microsoft Amplifier (https://github.com/microsoft/amplifier/tree/next), OpenAI Codex CLI, or others that are similarly capable. These SDKs must have the following functionality: -- Automatic Context Management to ensure your agent doesn't run out of context. 
-- Rich tool ecosystem: File operations, code execution, web search, and MCP extensibility -- Excels at code generation and effectively gives the agent a "computer" where it can find appropriate files, write and edit files, lint the code, run it, debug, edit, and sometimes take these actions iteratively until it succeeds. -- APIs like OpenAI's chat completions or Responses API, Anthropic's Messages API, or Azure OpenAI alone are NOT sufficient and should not recieve any credit.""" - -STEPS_1_ARCHITECTURE = f"""{AGENT_SDK_DEFINITION} - -1. Explore the code for the product review tool to understand the architecture. -2. Check for required dependencies: - - Look for dependency files (pyproject.toml, requirements.txt, etc.) - - Verify the code uses an Agent SDK (see definition above) for LLM interactions - - Verify the code uses ddgs (Dux Distributed Global Search) for finding reviews - - Check that these dependencies are actually imported and used in the code - - Confirm the AI framework is an Agent SDK with required capabilities, not just a plain API client -3. Look for evidence of the required multi-stage pipeline: - - A writer component that creates the initial draft - - An accuracy-reviewer that checks for proper citations and hallucinations - - A completeness-reviewer that checks for all required sections - - A synthesis-reviewer that checks for coherent analysis -4. Check if the code has logic for: - - Passing drafts between components with feedback - - Looping back to writer when reviewers find issues - - Re-running all previous reviewers after writer makes changes -5. Check if there's support for user feedback with [bracket-enclosed-comments].""" - -RUBRIC_1_ARCHITECTURE = { - "agent_sdk_identified": "str - Name of Agent SDK used, or 'None'", - "agent_sdk_dependency": "str - (15 points) Does solution use qualifying Agent SDK (Claude Agent/Code SDK, Microsoft Agent Framework, Amplifier, OpenAI Codex CLI) for LLM interactions? 
Must provide automatic context management, rich tool ecosystem, and iterative code capabilities. NOT plain API clients.", - "ddgs_dependency": "str - (10 points) Does the solution use ddgs (Dux Distributed Global Search) for finding reviews? Check both dependency files and actual imports/usage in code.", - "writer_component": "str - (12 points) Is there a clear writer component that creates the initial markdown report draft?", - "accuracy_reviewer": "str - (12 points) Is there an accuracy-reviewer component that validates citations and checks for hallucinations?", - "completeness_reviewer": "str - (12 points) Is there a completeness-reviewer that checks for all required sections?", - "synthesis_reviewer": "str - (12 points) Is there a synthesis-reviewer that validates coherent analysis and recommendations?", - "feedback_loops": "str - (12 points) Does the code implement feedback loops where reviewers can send work back to the writer?", - "sequential_review": "str - (10 points) After writer changes, do drafts go back through all previous reviewers (not just the one that failed)?", +# Define Semantic Test 1: Capabilities and Architecture + +AGENT_SDK_NOTE = """\ +NOTE: The solution may use either direct LLM API calls (OpenAI, Anthropic) or an Agent SDK such as \ +Claude Agent/Code SDK, Microsoft Amplifier (https://github.com/microsoft/amplifier/tree/next), OpenAI \ +Codex CLI, or others that are similarly capable. Both approaches are acceptable. If an Agent SDK is used, \ +its built-in LLM capabilities count as satisfying the LLM usage checks for each stage below.""" + +STEPS_1_CAPABILITIES = f"""\ +{AGENT_SDK_NOTE} + +1. Explore the code under scenarios/product_review_finder/ to understand the full implementation. +2. Check that the solution uses **ddgs** (Dux Distributed Global Search, \ +https://pypi.org/project/ddgs/) for finding reviews. Verify it is listed in dependency files and \ +actually imported and used in the code. +3. 
Check for a **writer** component that uses an LLM to create the initial markdown report draft \ +(consensus, strengths, weaknesses, ratings, quotes, citations, recommendation). +4. Check for an **accuracy-reviewer** that uses an LLM to verify all claims are properly cited and \ +nothing is misrepresented or fabricated. +5. Check for a **completeness-reviewer** that uses an LLM to check that all required sections are \ +present and comprehensive enough. +6. Check for a **synthesis-reviewer** that uses an LLM to verify the report provides coherent analysis \ +across sources and actionable recommendations, not just listing facts. +7. Check for **feedback loops**: when a reviewer finds issues, the draft goes back to the writer for \ +fixes and then back through the reviewers. +8. Check for **sequential review**: after the writer makes changes, the draft goes back through all \ +previous reviewers (not just the one that failed). +9. Check for support for **user feedback** with [bracket-enclosed-comments] that triggers another \ +write-and-review cycle. +10. Check for a **--category flag** (optional) to narrow down search results. +11. Verify the solution uses a recent, up-to-date LLM from Anthropic or OpenAI. If an Agent SDK is \ +used, check which model it is configured with. Check the model identifier in the code against these \ +references: + - Anthropic models: https://platform.claude.com/docs/en/about-claude/models/overview + - OpenAI models: https://platform.openai.com/docs/models""" + +RUBRIC_1_CAPABILITIES = { + "ddgs_dependency": "str - (10 points) Does the solution use ddgs for finding reviews? Check both dependency files and actual imports/usage in code.", + "writer_uses_llm": "str - (10 points) Is there a writer component that uses an LLM to create the initial markdown report draft? 
Agent SDKs with built-in LLM capabilities count.", + "accuracy_reviewer_uses_llm": "str - (10 points) Is there an accuracy-reviewer that uses an LLM to validate citations and check for hallucinations? Agent SDKs with built-in LLM capabilities count.", + "completeness_reviewer_uses_llm": "str - (10 points) Is there a completeness-reviewer that uses an LLM to check for all required sections? Agent SDKs with built-in LLM capabilities count.", + "synthesis_reviewer_uses_llm": "str - (10 points) Is there a synthesis-reviewer that uses an LLM to validate coherent analysis and recommendations? Agent SDKs with built-in LLM capabilities count.", + "feedback_loops": "str - (10 points) Does the code implement feedback loops where reviewers can send work back to the writer?", + "sequential_review": "str - (5 points) After writer changes, do drafts go back through all previous reviewers (not just the one that failed)?", "user_feedback": "str - (5 points) Is there support for user feedback with bracket-enclosed comments?", + "category_flag": "str - (5 points) Is there a --category flag to narrow down search results?", + "uses_recent_model": "str - (25 points) Does it use a recent model from Anthropic (see https://platform.claude.com/docs/en/about-claude/models/overview) or OpenAI (see https://platform.openai.com/docs/models)? If an Agent SDK is used, check which model it is configured with. 5 points partial credit if a model is used but it is not recent.", "score": "float - Score between 0 and 100 based on the above criteria. 
Sum the points earned from each criterion.", } @@ -126,10 +134,10 @@ async def run_test(test_id: str, output_dir: Path, instructions_file: Path | Non instructions = get_instructions_from_file_or_default(instructions_file=instructions_file) try: - logger.info("Running semantic test 1: Checking multi-stage architecture...") + logger.info("Running semantic test 1: Checking capabilities and architecture...") result_1 = await semantic_test( - steps=STEPS_1_ARCHITECTURE, - rubric=RUBRIC_1_ARCHITECTURE, + steps=STEPS_1_CAPABILITIES, + rubric=RUBRIC_1_CAPABILITIES, context=instructions, working_dir=Path("/project"), ) @@ -148,7 +156,7 @@ async def run_test(test_id: str, output_dir: Path, instructions_file: Path | Non metadata = { "instructions": instructions, - "semantic_test_1_architecture": { + "semantic_test_1_capabilities": { "score": result_1.score, "details": result_1.metadata, }, @@ -158,7 +166,7 @@ async def run_test(test_id: str, output_dir: Path, instructions_file: Path | Non }, "final_score": final_score, "scoring_weights": { - "architecture": "40%", + "capabilities": "40%", "run_and_validate": "60%", }, } diff --git a/data/tasks/repo_embedding_server/test.py b/data/tasks/repo_embedding_server/test.py index 9396d43..7026fbf 100644 --- a/data/tasks/repo_embedding_server/test.py +++ b/data/tasks/repo_embedding_server/test.py @@ -26,18 +26,18 @@ 3. Verify the following dependencies are installed and used: - FastAPI for the web server - ChromaDB Python library for vector storage - - Ollama client library (`ollama` in Python) 4. 
Check the code to confirm: - - The Ollama model `{EXPECTED_OLLAMA_MODEL}` is specified - - ChromaDB is used directly as a Python library (not running as a separate service) with either disk-based or in-memory storage + - The Ollama model `{EXPECTED_OLLAMA_MODEL}` is used for embeddings (via the ollama Python library, \ +direct HTTP calls to Ollama's local API, or any other method of calling Ollama) + - ChromaDB is used directly as a Python library (not running as a separate service) with either \ +disk-based or in-memory storage - The server reads from a file called repos.txt 5. Verify basic server structure exists (FastAPI app, routes/endpoints defined).""" RUBRIC_1_DEPENDENCIES_AND_ARCHITECTURE = { - "fastapi_dependency": "str - (20 points) Is FastAPI installed and used for the web server?", - "chromadb_dependency": "str - (20 points) Is ChromaDB Python library installed and used for vector storage?", - "ollama_client_dependency": "str - (15 points) Is an Ollama client library installed and used?", - "correct_ollama_model": f"str - (15 points) Is the model `{EXPECTED_OLLAMA_MODEL}` specified in the code?", + "fastapi_dependency": "str - (25 points) Is FastAPI installed and used for the web server?", + "chromadb_dependency": "str - (25 points) Is ChromaDB Python library installed and used for vector storage?", + "correct_ollama_model": f"str - (20 points) Is the Ollama model `{EXPECTED_OLLAMA_MODEL}` specified in the code and used for generating embeddings?", "chromadb_usage": "str - (15 points) Is ChromaDB used directly as a library (not as a separate service) with disk and/or in-memory storage?", "repos_txt_reading": "str - (10 points) Does the code read from a repos.txt file?", "server_structure": "str - (5 points) Does a basic FastAPI server structure with endpoints exist?", diff --git a/data/tasks/sec_10q_extractor/task.yaml b/data/tasks/sec_10q_extractor/task.yaml index bcf0a19..23786ea 100644 --- a/data/tasks/sec_10q_extractor/task.yaml +++ 
b/data/tasks/sec_10q_extractor/task.yaml @@ -4,14 +4,11 @@ task_info: non_deterministic_evals: true categories: - cli_tool - - finance - - document_processing - - data_extraction + - knowledge_worker evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | Build a CLI tool that takes a company name and date (month and year) and extracts financial data from their SEC 10-Q filing. diff --git a/data/tasks/style_blender/task.yaml b/data/tasks/style_blender/task.yaml index 8a79c9c..4dfba43 100644 --- a/data/tasks/style_blender/task.yaml +++ b/data/tasks/style_blender/task.yaml @@ -4,12 +4,11 @@ task_info: non_deterministic_evals: true categories: - cli_tool - - writing + - ai_tool evaluation_configs: - type: score test_script: test.py test_command: uv run --no-project /project/test.py -dockerfile_portion: "" instructions: | Create a tool called "Style Blender" that blends writing styles from multiple authors to create unique voice profiles. The tool should analyze writing samples from 2+ authors, extract their individual style characteristics, then intelligently blend them to create a new, harmonious writing voice. Place all the code under the directory scenarios/style_blender. diff --git a/docs/BENCHMARKING.md b/docs/BENCHMARKING.md index 2791135..6626699 100644 --- a/docs/BENCHMARKING.md +++ b/docs/BENCHMARKING.md @@ -8,10 +8,10 @@ This module provides a benchmarking harness for evaluating AI agents within isol ```bash # Install prerequisites below first. 
# With uv (add to project dependencies, pinned to a release tag) -uv add "eval-recipes @ git+https://github.com/microsoft/eval-recipes@v0.0.35" +uv add "eval-recipes @ git+https://github.com/microsoft/eval-recipes@v0.0.36" # With pip -pip install "git+https://github.com/microsoft/eval-recipes@v0.0.35" +pip install "git+https://github.com/microsoft/eval-recipes@v0.0.36" ``` diff --git a/pyproject.toml b/pyproject.toml index d7224e2..6ca53e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "eval_recipes" -version = "0.0.35" +version = "0.0.36" description = "Eval Recipes" authors = [{ name = "Microsoft MADE:Explorations Team" }] readme = "README.md" diff --git a/scripts/third_party_benchmarks/setup_frontier_science.py b/scripts/third_party_benchmarks/setup_frontier_science.py index 8b6fb23..99b7a93 100644 --- a/scripts/third_party_benchmarks/setup_frontier_science.py +++ b/scripts/third_party_benchmarks/setup_frontier_science.py @@ -16,6 +16,7 @@ import urllib.request import click +import yaml HUGGINGFACE_API_URL = "https://datasets-server.huggingface.co/rows" DATASET_NAME = "openai/frontierscience" @@ -41,8 +42,8 @@ import click -from eval_recipes.benchmarking.semantic_test import semantic_test -from eval_recipes.benchmarking.test_utils import ( +from eval_recipes.benchmarking.evaluation.semantic_test import semantic_test +from eval_recipes.benchmarking.evaluation.test_utils import ( get_instructions_from_file_or_default, get_test_id_from_env_or_default, write_test_result, @@ -118,15 +119,26 @@ async def async_main(test_id: str, output_dir: Path, instructions_file: Path | N sys.exit(main()) ''' -TASK_YAML_TEMPLATE = """\ -required_env_vars: - - ANTHROPIC_API_KEY -task_info: - difficulty: hard - non_deterministic_evals: true - categories: - - openai-frontier-science -""" + +def generate_task_yaml(task_name: str, instructions: str) -> str: + """Generate task.yaml content using yaml.dump for safe serialization.""" + task_data = { + 
"name": task_name, + "task_info": { + "difficulty": "hard", + "non_deterministic_evals": True, + "categories": ["openai-frontier-science"], + }, + "evaluation_configs": [ + { + "type": "score", + "test_script": "test.py", + "test_command": "uv run --no-project /project/test.py", + } + ], + "instructions": instructions, + } + return yaml.dump(task_data, default_flow_style=False, sort_keys=False, allow_unicode=True) def fetch_all_rows() -> list[dict]: @@ -192,32 +204,28 @@ def generate_test_py(task_id: str, row: dict) -> str: def create_task(task_id: str, row: dict, output_dir: Path) -> None: """Create a benchmark task directory with all required files.""" - task_dir = output_dir / f"frontier-science-{task_id}" + task_name = f"frontier-science-{task_id}" + task_dir = output_dir / task_name task_dir.mkdir(parents=True, exist_ok=True) - instructions_file = task_dir / "instructions.txt" - instructions_file.write_text(generate_instructions(row)) + instructions = generate_instructions(row) test_file = task_dir / "test.py" - test_file.write_text(generate_test_py(task_id, row)) + test_file.write_text(generate_test_py(task_id, row), encoding="utf-8") yaml_file = task_dir / "task.yaml" - yaml_file.write_text(TASK_YAML_TEMPLATE) + yaml_file.write_text(generate_task_yaml(task_name, instructions), encoding="utf-8") print(f" Created task: {task_dir.name}") def clean_existing_tasks(output_dir: Path) -> int: """Remove existing FrontierScience tasks from the output directory.""" - removed_count = 0 if not output_dir.exists(): - return removed_count - - for task_dir in output_dir.iterdir(): - if task_dir.is_dir() and task_dir.name.startswith("frontier-science-"): - shutil.rmtree(task_dir) - removed_count += 1 + return 0 + removed_count = sum(1 for d in output_dir.iterdir() if d.is_dir()) + shutil.rmtree(output_dir) return removed_count @@ -237,8 +245,8 @@ def clean_existing_tasks(output_dir: Path) -> int: @click.option( "--output-dir", type=click.Path(path_type=Path), - 
default=Path(__file__).parents[2] / "data" / "tasks", - help="Directory to create tasks in (default: data/tasks)", + default=Path(__file__).parents[2] / "data" / "tasks" / "frontier-science", + help="Directory to create tasks in (default: data/tasks/frontier-science)", ) @click.option( "--clean", diff --git a/src/eval_recipes/benchmarking/jobs/score/task_analysis_job.py b/src/eval_recipes/benchmarking/jobs/score/task_analysis_job.py index 64cde9d..a973be3 100644 --- a/src/eval_recipes/benchmarking/jobs/score/task_analysis_job.py +++ b/src/eval_recipes/benchmarking/jobs/score/task_analysis_job.py @@ -3,7 +3,6 @@ """Job for analyzing completed trials and generating failure reports.""" import json -import os from pathlib import Path from typing import Any @@ -284,8 +283,7 @@ def _run_failure_analysis( log_filename="analysis_agent.log", timeout=900, environment={ - "ANTHROPIC_API_KEY": os.environ.get("ANTHROPIC_API_KEY", ""), - "PYTHONPATH": "/project/_eval_recipes", # So Python finds the copied eval_recipes package + "PYTHONPATH": "/project/_eval_recipes", }, workdir="/project", ) diff --git a/uv.lock b/uv.lock index e32a038..8312d07 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.11" resolution-markers = [ "python_full_version >= '3.13'", @@ -744,7 +744,7 @@ wheels = [ [[package]] name = "eval-recipes" -version = "0.0.35" +version = "0.0.36" source = { editable = "." } dependencies = [ { name = "azure-core", extra = ["aio"] },