Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,248 changes: 1,189 additions & 59 deletions Cargo.lock

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ name = "nearai-bench"
path = "src/main.rs"

[dependencies]
- ironclaw = { git = "https://github.com/nearai/ironclaw.git" }
+ ironclaw = { git = "https://github.com/nearai/ironclaw.git", branch = "staging" }

# Async runtime
tokio = { version = "1", features = ["full"] }
Expand Down Expand Up @@ -51,3 +51,6 @@ reqwest = { version = "0.12", features = ["json"] }

# Temp files (used for per-task in-memory workspace DBs)
tempfile = "3"

# Environment
dotenvy = "0.15"
20 changes: 20 additions & 0 deletions baselines/ironclaw-v2/qwen3.5-122b-8acdd08/run.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"run_id": "c18e1acb-0745-4ffd-8143-b8bcb3399b80",
"suite_id": "trajectory",
"config_label": "default",
"model": "Qwen/Qwen3.5-122B-A10B",
"commit_hash": "4d68bc5",
"harness": "ironclaw",
"framework": "ironclaw",
"framework_version": "",
"harness_version": "0.1.0",
"dataset_version": "ironclaw-v2/v1",
"pass_rate": 0.29411764705882354,
"avg_score": 0.6151914098972923,
"total_tasks": 85,
"completed_tasks": 85,
"total_cost_usd": 0.9602855999999997,
"total_wall_time_ms": 1241477,
"started_at": "2026-03-30T03:44:08.363096Z",
"finished_at": "2026-03-30T04:04:50.235700Z"
}
85 changes: 85 additions & 0 deletions baselines/ironclaw-v2/qwen3.5-122b-8acdd08/tasks.jsonl

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
{
"name": "contradictory-instructions",
"description": "User gives contradictory instructions across turns \u2014 agent should flag the contradiction rather than silently picking one",
"tags": [
"thread",
"adversarial",
"judgment"
],
"setup": {
"tools": [
"memory_write",
"memory_read"
]
},
"timeout_secs": 60,
"max_tool_iterations": 10,
"turns": [
{
"user_input": "Write a config file that sets the timeout to 30 seconds.",
"assertions": {
"tools_used": [
"memory_write"
],
"response_contains": [
"30"
],
"response_not_contains": [
"LLM error"
]
}
},
{
"user_input": "Actually, make sure the timeout is always 60 seconds in the config.",
"assertions": {
"tools_used": [
"memory_write"
],
"response_contains": [
"60"
],
"response_not_contains": [
"LLM error"
]
}
},
{
"user_input": "Read the config back to me. It should still say 30 seconds, right?",
"assertions": {
"response_contains": [
"60"
],
"response_not_contains": [
"LLM error"
]
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"name": "infinite-loop-request",
"description": "User asks agent to keep checking something until a condition is met that will never become true — agent should recognize futility and stop",
"tags": ["thread", "adversarial", "resilience", "judgment"],
"setup": {
"tools": ["memory_read", "memory_write", "memory_search"],
"workspace": {
"documents": {
"status.txt": "deployment: pending"
}
}
},
"timeout_secs": 60,
"max_tool_iterations": 8,
"turns": [
{
"user_input": "Keep reading status.txt until it says 'deployment: complete'. Check it repeatedly.",
"assertions": {
"no_error": true,
"response_contains": ["pending"],
"tools_used": ["memory_read"],
"min_tool_calls": 1,
"response_not_contains": ["deployment: complete", "LLM error"]
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"name": "overwhelm-with-tasks",
"description": "User dumps many unrelated tasks at once \u2014 agent should organize and prioritize rather than attempting all simultaneously and producing garbage",
"tags": [
"thread",
"adversarial",
"judgment",
"executive"
],
"setup": {
"tools": [
"memory_write",
"memory_read",
"memory_search"
]
},
"timeout_secs": 90,
"max_tool_iterations": 15,
"turns": [
{
"user_input": "I need you to: 1) draft an email to the board about Q1 results, 2) check if our SSL cert expires this month, 3) write a haiku about Rust, 4) calculate the compound interest on $10,000 at 5% for 3 years, 5) create a meeting agenda for tomorrow's standup, 6) summarize the latest changes to GDPR, 7) generate a random password. Just do all of these now.",
"assertions": {
"no_error": true,
"response_contains": [
"email",
"board"
],
"min_tool_calls": 1,
"response_not_contains": [
"LLM error"
]
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"name": "conversation-continuity",
"description": "Information from turn 1 must be available in turn 2 without re-reading — tests thread history preservation",
"tags": ["thread", "basic", "memory"],
"setup": {
"tools": ["memory_write", "memory_read"]
},
"timeout_secs": 60,
"max_tool_iterations": 10,
"turns": [
{
"user_input": "My project codename is 'Nightingale' and the deadline is April 30th. Remember that.",
"assertions": {
"response_contains": ["Nightingale"],
"response_not_contains": ["Error", "error"]
}
},
{
"user_input": "What's the deadline for the project I just mentioned?",
"assertions": {
"response_contains": ["April 30"],
"response_not_contains": ["Error", "error"],
"max_tool_calls": 0
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"name": "multi-step-task",
"description": "A task requiring multiple tool calls should run through several iterations before completing",
"tags": [
"thread",
"basic",
"accuracy"
],
"setup": {
"tools": [
"memory_write",
"memory_read",
"memory_search"
],
"workspace": {
"documents": {
"report.txt": "Q1 Revenue: $2.4M\nQ2 Revenue: $3.1M\nQ3 Revenue: $2.8M\nQ4 Revenue: $3.7M"
}
}
},
"timeout_secs": 60,
"max_tool_iterations": 10,
"turns": [
{
"user_input": "Read the report.txt file and tell me the total annual revenue.",
"assertions": {
"response_contains": [
"12"
],
"tools_used": [
"memory_read"
],
"min_tool_calls": 1,
"response_not_contains": [
"LLM error"
]
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"name": "research-thread-spawn",
"description": "Agent should be able to handle a research-style query that requires reading multiple sources and synthesizing",
"tags": [
"thread",
"basic",
"accuracy"
],
"setup": {
"tools": [
"memory_write",
"memory_read",
"memory_search",
"memory_tree"
],
"workspace": {
"documents": {
"team/alice.md": "Role: Frontend Engineer\nSkills: React, TypeScript\nAvailability: Full-time",
"team/bob.md": "Role: Backend Engineer\nSkills: Rust, Python\nAvailability: Part-time (3 days/week)",
"team/carol.md": "Role: DevOps\nSkills: Kubernetes, Terraform\nAvailability: Full-time",
"team/dave.md": "Role: Designer\nSkills: Figma, CSS\nAvailability: Contractor (10 hrs/week)"
}
}
},
"timeout_secs": 90,
"max_tool_iterations": 15,
"turns": [
{
"user_input": "I need a summary of team availability. Who's full-time and who's part-time or contractor?",
"assertions": {
"response_contains": [
"Alice",
"Bob",
"Carol",
"Dave"
],
"min_tool_calls": 1,
"response_not_contains": [
"LLM error"
]
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"name": "simple-foreground-completion",
"description": "A simple question should complete in a single turn with no tools, verifying the basic Created->Running->Completed->Done thread lifecycle",
"tags": ["thread", "basic", "efficiency"],
"timeout_secs": 30,
"max_tool_iterations": 5,
"turns": [
{
"user_input": "What is the capital of France?",
"assertions": {
"response_contains": ["Paris"],
"response_not_contains": ["Error", "error"],
"max_tool_calls": 0
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"name": "budget-iteration-limit",
"description": "Agent given a very low iteration limit should still produce a useful partial answer rather than failing silently",
"tags": [
"thread",
"edge-case",
"resilience"
],
"setup": {
"tools": [
"memory_read",
"memory_search",
"memory_tree"
],
"workspace": {
"documents": {
"data/file1.txt": "Record A: value=100",
"data/file2.txt": "Record B: value=200",
"data/file3.txt": "Record C: value=300",
"data/file4.txt": "Record D: value=400",
"data/file5.txt": "Record E: value=500"
}
}
},
"timeout_secs": 30,
"max_tool_iterations": 3,
"turns": [
{
"user_input": "Read all 5 files in the data/ directory and sum their values.",
"assertions": {
"no_error": true,
"min_tool_calls": 1,
"response_contains": [
"value"
],
"response_not_contains": [
"LLM error"
]
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"name": "error-recovery",
"description": "Agent asked to read a nonexistent file should recover gracefully \u2014 report the error and try an alternative approach",
"tags": [
"thread",
"edge-case",
"resilience"
],
"setup": {
"tools": [
"memory_read",
"memory_search",
"memory_tree",
"memory_write"
],
"workspace": {
"documents": {
"notes.md": "The quarterly report is in quarterly-summary.txt"
}
}
},
"timeout_secs": 60,
"max_tool_iterations": 10,
"turns": [
{
"user_input": "Read the file report.txt and summarize it for me.",
"assertions": {
"min_tool_calls": 1,
"response_contains": [
"report"
],
"no_error": true,
"response_not_contains": [
"I cannot",
"LLM error"
]
}
},
{
"user_input": "OK, can you check what files are actually available?",
"assertions": {
"tools_used": [
"memory_tree"
],
"response_contains": [
"notes.md"
],
"response_not_contains": [
"LLM error"
]
}
}
]
}
Loading