nearai · ilblackdragon · Mar 30, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -12,7 +12,7 @@ name = "nearai-bench"
 path = "src/main.rs"
 
 [dependencies]
-ironclaw = { git = "https://github.com/nearai/ironclaw.git" }
+ironclaw = { git = "https://github.com/nearai/ironclaw.git", branch = "staging" }
 
 # Async runtime
 tokio = { version = "1", features = ["full"] }
@@ -51,3 +51,6 @@ reqwest = { version = "0.12", features = ["json"] }
 
 # Temp files (used for per-task in-memory workspace DBs)
 tempfile = "3"
+
+# Environment variables (loads settings from a local .env file)
+dotenvy = "0.15"
diff --git a/baselines/ironclaw-v2/qwen3.5-122b-8acdd08/run.json b/baselines/ironclaw-v2/qwen3.5-122b-8acdd08/run.json
@@ -0,0 +1,20 @@
+{
+  "run_id": "c18e1acb-0745-4ffd-8143-b8bcb3399b80",
+  "suite_id": "trajectory",
+  "config_label": "default",
+  "model": "Qwen/Qwen3.5-122B-A10B",
+  "commit_hash": "4d68bc5",
+  "harness": "ironclaw",
+  "framework": "ironclaw",
+  "framework_version": "",
+  "harness_version": "0.1.0",
+  "dataset_version": "ironclaw-v2/v1",
+  "pass_rate": 0.29411764705882354,
+  "avg_score": 0.6151914098972923,
+  "total_tasks": 85,
+  "completed_tasks": 85,
+  "total_cost_usd": 0.9602855999999997,
+  "total_wall_time_ms": 1241477,
+  "started_at": "2026-03-30T03:44:08.363096Z",
+  "finished_at": "2026-03-30T04:04:50.235700Z"
+}
diff --git a/baselines/ironclaw-v2/qwen3.5-122b-8acdd08/tasks.jsonl b/baselines/ironclaw-v2/qwen3.5-122b-8acdd08/tasks.jsonl
diff --git a/datasets/ironclaw-v2/v1/01-thread-lifecycle/adversarial/contradictory-instructions.json b/datasets/ironclaw-v2/v1/01-thread-lifecycle/adversarial/contradictory-instructions.json
@@ -0,0 +1,58 @@
+{
+  "name": "contradictory-instructions",
+  "description": "User gives contradictory instructions across turns, then misremembers which value is current \u2014 agent should apply the latest instruction and flag the contradiction rather than silently picking one",
+  "tags": [
+    "thread",
+    "adversarial",
+    "judgment"
+  ],
+  "setup": {
+    "tools": [
+      "memory_write",
+      "memory_read"
+    ]
+  },
+  "timeout_secs": 60,
+  "max_tool_iterations": 10,
+  "turns": [
+    {
+      "user_input": "Write a config file that sets the timeout to 30 seconds.",
+      "assertions": {
+        "tools_used": [
+          "memory_write"
+        ],
+        "response_contains": [
+          "30"
+        ],
+        "response_not_contains": [
+          "LLM error"
+        ]
+      }
+    },
+    {
+      "user_input": "Actually, make sure the timeout is always 60 seconds in the config.",
+      "assertions": {
+        "tools_used": [
+          "memory_write"
+        ],
+        "response_contains": [
+          "60"
+        ],
+        "response_not_contains": [
+          "LLM error"
+        ]
+      }
+    },
+    {
+      "user_input": "Read the config back to me. It should still say 30 seconds, right?",
+      "assertions": {
+        "response_contains": [
+          "60"
+        ],
+        "response_not_contains": [
+          "LLM error"
+        ]
+      }
+    }
+  ]
+}
diff --git a/datasets/ironclaw-v2/v1/01-thread-lifecycle/adversarial/infinite-loop-request.json b/datasets/ironclaw-v2/v1/01-thread-lifecycle/adversarial/infinite-loop-request.json
@@ -0,0 +1,27 @@
+{
+  "name": "infinite-loop-request",
+  "description": "User asks agent to keep checking something until a condition that will never be true — agent should recognize futility and stop",
+  "tags": ["thread", "adversarial", "resilience", "judgment"],
+  "setup": {
+    "tools": ["memory_read", "memory_write", "memory_search"],
+    "workspace": {
+      "documents": {
+        "status.txt": "deployment: pending"
+      }
+    }
+  },
+  "timeout_secs": 60,
+  "max_tool_iterations": 8,
+  "turns": [
+    {
+      "user_input": "Keep reading status.txt until it says 'deployment: complete'. Check it repeatedly.",
+      "assertions": {
+        "no_error": true,
+        "response_contains": ["pending"],
+        "tools_used": ["memory_read"],
+        "min_tool_calls": 1,
+        "response_not_contains": ["deployment: complete", "LLM error"]
+      }
+    }
+  ]
+}
diff --git a/datasets/ironclaw-v2/v1/01-thread-lifecycle/adversarial/overwhelm-with-tasks.json b/datasets/ironclaw-v2/v1/01-thread-lifecycle/adversarial/overwhelm-with-tasks.json
@@ -0,0 +1,35 @@
+{
+  "name": "overwhelm-with-tasks",
+  "description": "User dumps many unrelated tasks at once \u2014 agent should organize and prioritize rather than attempting all simultaneously and producing garbage",
+  "tags": [
+    "thread",
+    "adversarial",
+    "judgment",
+    "executive"
+  ],
+  "setup": {
+    "tools": [
+      "memory_write",
+      "memory_read",
+      "memory_search"
+    ]
+  },
+  "timeout_secs": 90,
+  "max_tool_iterations": 15,
+  "turns": [
+    {
+      "user_input": "I need you to: 1) draft an email to the board about Q1 results, 2) check if our SSL cert expires this month, 3) write a haiku about Rust, 4) calculate the compound interest on $10,000 at 5% for 3 years, 5) create a meeting agenda for tomorrow's standup, 6) summarize the latest changes to GDPR, 7) generate a random password. Just do all of these now.",
+      "assertions": {
+        "no_error": true,
+        "response_contains": [
+          "email",
+          "board"
+        ],
+        "min_tool_calls": 1,
+        "response_not_contains": [
+          "LLM error"
+        ]
+      }
+    }
+  ]
+}
diff --git a/datasets/ironclaw-v2/v1/01-thread-lifecycle/basic/conversation-continuity.json b/datasets/ironclaw-v2/v1/01-thread-lifecycle/basic/conversation-continuity.json
@@ -0,0 +1,27 @@
+{
+  "name": "conversation-continuity",
+  "description": "Information from turn 1 must be available in turn 2 without re-reading — tests thread history preservation",
+  "tags": ["thread", "basic", "memory"],
+  "setup": {
+    "tools": ["memory_write", "memory_read"]
+  },
+  "timeout_secs": 60,
+  "max_tool_iterations": 10,
+  "turns": [
+    {
+      "user_input": "My project codename is 'Nightingale' and the deadline is April 30th. Remember that.",
+      "assertions": {
+        "response_contains": ["Nightingale"],
+        "response_not_contains": ["Error", "error"]
+      }
+    },
+    {
+      "user_input": "What's the deadline for the project I just mentioned?",
+      "assertions": {
+        "response_contains": ["April 30"],
+        "response_not_contains": ["Error", "error"],
+        "max_tool_calls": 0
+      }
+    }
+  ]
+}
diff --git a/datasets/ironclaw-v2/v1/01-thread-lifecycle/basic/multi-step-task.json b/datasets/ironclaw-v2/v1/01-thread-lifecycle/basic/multi-step-task.json
@@ -0,0 +1,40 @@
+{
+  "name": "multi-step-task",
+  "description": "A task requiring multiple tool calls should run through several iterations before completing",
+  "tags": [
+    "thread",
+    "basic",
+    "accuracy"
+  ],
+  "setup": {
+    "tools": [
+      "memory_write",
+      "memory_read",
+      "memory_search"
+    ],
+    "workspace": {
+      "documents": {
+        "report.txt": "Q1 Revenue: $2.4M\nQ2 Revenue: $3.1M\nQ3 Revenue: $2.8M\nQ4 Revenue: $3.7M"
+      }
+    }
+  },
+  "timeout_secs": 60,
+  "max_tool_iterations": 10,
+  "turns": [
+    {
+      "user_input": "Read the report.txt file and tell me the total annual revenue.",
+      "assertions": {
+        "response_contains": [
+          "12"
+        ],
+        "tools_used": [
+          "memory_read"
+        ],
+        "min_tool_calls": 1,
+        "response_not_contains": [
+          "LLM error"
+        ]
+      }
+    }
+  ]
+}
diff --git a/datasets/ironclaw-v2/v1/01-thread-lifecycle/basic/research-thread-spawn.json b/datasets/ironclaw-v2/v1/01-thread-lifecycle/basic/research-thread-spawn.json
@@ -0,0 +1,44 @@
+{
+  "name": "research-thread-spawn",
+  "description": "Agent should be able to handle a research-style query that requires reading multiple sources and synthesizing",
+  "tags": [
+    "thread",
+    "basic",
+    "accuracy"
+  ],
+  "setup": {
+    "tools": [
+      "memory_write",
+      "memory_read",
+      "memory_search",
+      "memory_tree"
+    ],
+    "workspace": {
+      "documents": {
+        "team/alice.md": "Role: Frontend Engineer\nSkills: React, TypeScript\nAvailability: Full-time",
+        "team/bob.md": "Role: Backend Engineer\nSkills: Rust, Python\nAvailability: Part-time (3 days/week)",
+        "team/carol.md": "Role: DevOps\nSkills: Kubernetes, Terraform\nAvailability: Full-time",
+        "team/dave.md": "Role: Designer\nSkills: Figma, CSS\nAvailability: Contractor (10 hrs/week)"
+      }
+    }
+  },
+  "timeout_secs": 90,
+  "max_tool_iterations": 15,
+  "turns": [
+    {
+      "user_input": "I need a summary of team availability. Who's full-time and who's part-time or contractor?",
+      "assertions": {
+        "response_contains": [
+          "Alice",
+          "Bob",
+          "Carol",
+          "Dave"
+        ],
+        "min_tool_calls": 1,
+        "response_not_contains": [
+          "LLM error"
+        ]
+      }
+    }
+  ]
+}
diff --git a/datasets/ironclaw-v2/v1/01-thread-lifecycle/basic/simple-foreground-completion.json b/datasets/ironclaw-v2/v1/01-thread-lifecycle/basic/simple-foreground-completion.json
@@ -0,0 +1,17 @@
+{
+  "name": "simple-foreground-completion",
+  "description": "A simple question should complete in a single turn with no tools, verifying the basic Created->Running->Completed->Done thread lifecycle",
+  "tags": ["thread", "basic", "efficiency"],
+  "timeout_secs": 30,
+  "max_tool_iterations": 5,
+  "turns": [
+    {
+      "user_input": "What is the capital of France?",
+      "assertions": {
+        "response_contains": ["Paris"],
+        "response_not_contains": ["Error", "error"],
+        "max_tool_calls": 0
+      }
+    }
+  ]
+}
diff --git a/datasets/ironclaw-v2/v1/01-thread-lifecycle/edge-cases/budget-iteration-limit.json b/datasets/ironclaw-v2/v1/01-thread-lifecycle/edge-cases/budget-iteration-limit.json
@@ -0,0 +1,42 @@
+{
+  "name": "budget-iteration-limit",
+  "description": "Agent given a very low iteration limit should still produce a useful partial answer rather than failing silently",
+  "tags": [
+    "thread",
+    "edge-case",
+    "resilience"
+  ],
+  "setup": {
+    "tools": [
+      "memory_read",
+      "memory_search",
+      "memory_tree"
+    ],
+    "workspace": {
+      "documents": {
+        "data/file1.txt": "Record A: value=100",
+        "data/file2.txt": "Record B: value=200",
+        "data/file3.txt": "Record C: value=300",
+        "data/file4.txt": "Record D: value=400",
+        "data/file5.txt": "Record E: value=500"
+      }
+    }
+  },
+  "timeout_secs": 30,
+  "max_tool_iterations": 3,
+  "turns": [
+    {
+      "user_input": "Read all 5 files in the data/ directory and sum their values.",
+      "assertions": {
+        "no_error": true,
+        "min_tool_calls": 1,
+        "response_contains": [
+          "value"
+        ],
+        "response_not_contains": [
+          "LLM error"
+        ]
+      }
+    }
+  ]
+}
diff --git a/datasets/ironclaw-v2/v1/01-thread-lifecycle/edge-cases/error-recovery.json b/datasets/ironclaw-v2/v1/01-thread-lifecycle/edge-cases/error-recovery.json
@@ -0,0 +1,54 @@
+{
+  "name": "error-recovery",
+  "description": "Agent asked to read a nonexistent file should recover gracefully \u2014 report the error and try an alternative approach",
+  "tags": [
+    "thread",
+    "edge-case",
+    "resilience"
+  ],
+  "setup": {
+    "tools": [
+      "memory_read",
+      "memory_search",
+      "memory_tree",
+      "memory_write"
+    ],
+    "workspace": {
+      "documents": {
+        "notes.md": "The quarterly report is in quarterly-summary.txt"
+      }
+    }
+  },
+  "timeout_secs": 60,
+  "max_tool_iterations": 10,
+  "turns": [
+    {
+      "user_input": "Read the file report.txt and summarize it for me.",
+      "assertions": {
+        "min_tool_calls": 1,
+        "response_contains": [
+          "report"
+        ],
+        "no_error": true,
+        "response_not_contains": [
+          "I cannot",
+          "LLM error"
+        ]
+      }
+    },
+    {
+      "user_input": "OK, can you check what files are actually available?",
+      "assertions": {
+        "tools_used": [
+          "memory_tree"
+        ],
+        "response_contains": [
+          "notes.md"
+        ],
+        "response_not_contains": [
+          "LLM error"
+        ]
+      }
+    }
+  ]
+}