PrimeIntellect-ai · mikasenghaas · May 25, 2026 · May 17, 2026 · May 17, 2026 · May 17, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -50,11 +50,10 @@ Write tests as plain functions with pytest fixtures. Don't use class-based tests
 
 ## Git
 
-- **Branch prefixes**: use the following prefixes for branches: `feat/`, `fix/`, `chore/`
+- **Branch prefixes**: prefix branch names with `feat/`, `fix/`, or `chore/`; use `exp/` for experiment branches (those holding configs, run summaries, pins, and notes).
 
 ## GitHub
 
 - **Draft PRs**: always create PRs as drafts (`gh pr create --draft`) to avoid triggering CI unnecessarily.
 - **Pull requests**: do not include a "test plan" section in PR descriptions unless you actually ran tests to verify the changes or the user explicitly asked for one.
 - **Keep PR descriptions in sync**: every time you push commits to a PR, also update the PR description (`gh pr edit <num> --body-file ...`) so it reflects the current state of the branch — not just what was true when the PR was opened. Preserve any auto-generated blocks (e.g. `<!-- CURSOR_SUMMARY -->`).
-
diff --git a/configs/general_agent/rl_qwen3_0p6b.toml b/configs/general_agent/rl_qwen3_0p6b.toml
@@ -0,0 +1,29 @@
+max_steps = 5
+seq_len = 8192
+
+[wandb]
+project = "general-agent-debug"
+name = "qwen3-0p6b-rlm"
+
+[model]
+name = "Qwen/Qwen3-0.6B"
+
+[orchestrator]
+batch_size = 16
+rollouts_per_example = 4
+
+[orchestrator.train.sampling]
+max_completion_tokens = 4096
+
+[[orchestrator.train.env]]
+id = "general-agent-solver-rlm"
+
+[trainer]
+
+[inference]
+
+[inference.model]
+max_model_len = 8192
+
+[inference.parallel]
+dp = 1
diff --git a/configs/general_agent/rl_qwen3_30b_a3b.toml b/configs/general_agent/rl_qwen3_30b_a3b.toml
@@ -0,0 +1,52 @@
+max_steps = 400
+seq_len = 32768
+
+[slurm]
+job_name = "general-agent-qwen3-30b-a3b-rlm"
+
+[deployment]
+type = "multi_node"
+num_train_nodes = 1
+num_infer_nodes = 1
+
+[wandb]
+project = "general-agent-debug"
+name = "qwen3-30b-a3b-rlm"
+
+[ckpt]
+interval = 50
+keep_last = 1
+
+[model]
+name = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+
+[trainer]
+
+[trainer.model]
+cp = 2
+
+[trainer.model.ac]
+freq = 1
+
+[trainer.model.compile]
+
+[orchestrator]
+batch_size = 512
+rollouts_per_example = 16
+max_off_policy_steps = 32
+
+[[orchestrator.train.env]]
+id = "general-agent-solver-rlm"
+
+[orchestrator.train.env.args]
+min_tier = 1
+
+[inference]
+gpu_memory_utilization = 0.85
+
+[inference.model]
+max_model_len = 32768
+
+[inference.parallel]
+dp = 2
+tp = 4
diff --git a/configs/general_agent/rl_qwen3_4b.toml b/configs/general_agent/rl_qwen3_4b.toml
@@ -0,0 +1,44 @@
+max_steps = 200
+seq_len = 32768
+
+[deployment]
+num_train_gpus = 4
+num_infer_gpus = 4
+
+[wandb]
+project = "general-agent-debug"
+name = "qwen3-4b-rlm"
+
+[ckpt]
+interval = 100
+keep_last = 1
+
+[model]
+name = "Qwen/Qwen3-4B-Instruct-2507"
+
+[trainer]
+
+[trainer.model]
+cp = 2
+
+[trainer.model.ac]
+freq = 1
+
+[trainer.model.compile]
+
+[orchestrator]
+batch_size = 512
+rollouts_per_example = 8
+max_off_policy_steps = 32
+
+[[orchestrator.train.env]]
+id = "general-agent-solver-rlm"
+
+[inference]
+gpu_memory_utilization = 0.85
+
+[inference.model]
+max_model_len = 32768
+
+[inference.parallel]
+dp = 4
diff --git a/configs/private b/configs/private
diff --git a/deps/research-environments b/deps/research-environments
diff --git a/pyproject.toml b/pyproject.toml
@@ -66,6 +66,7 @@ envs = [
     "code-env",
     "color-codeword",
     "deepdive",
+    "general-agent",
     "gpqa",
     "hle",
     "ifeval",
@@ -118,6 +119,7 @@ dev = [
     "ipywidgets>=8.1.7",
     "pre-commit>=4.2.0",
     "pytest>=8.4.1",
+    "pytest-asyncio>=0.23",
     "ruff>=0.12.1",
 ]
 
@@ -137,6 +139,7 @@ members = [
     "deps/research-environments/environments/code_env",
     "deps/research-environments/environments/color_codeword",
     "deps/research-environments/environments/deepdive",
+    "deps/research-environments/environments/general_agent",
     "deps/research-environments/environments/gpqa",
     "deps/research-environments/environments/hle",
     "deps/research-environments/environments/ifeval",
@@ -203,6 +206,7 @@ alphabet-sort = { workspace = true }
 code-env = { workspace = true }
 color-codeword = { workspace = true }
 deepdive = { workspace = true }
+general-agent = { workspace = true }
 gpqa = { workspace = true }
 hle = { workspace = true }
 ifeval = { workspace = true }

diff --git a/tests/unit/test_configs.py b/tests/unit/test_configs.py
@@ -1,3 +1,4 @@
+import tomllib
 from pathlib import Path
 from typing import Annotated, Literal
 
@@ -33,9 +34,19 @@ def get_config_files() -> list[Path]:
     return config_files + example_files
 
 
+def is_eval_config(path: Path) -> bool:
+    """Return True if the TOML at `path` has a top-level `eval` array, i.e. it is a vf-eval config that lives under configs but is not a prime-rl entrypoint config."""
+    with path.open("rb") as f:
+        data = tomllib.load(f)
+    return isinstance(data.get("eval"), list)
+
+
 @pytest.mark.parametrize("config_file", get_config_files(), ids=lambda x: x.as_posix())
 def test_load_configs(config_file: Path):
     """Tests that all config files can be loaded by at least one config class."""
+    if is_eval_config(config_file):
+        pytest.skip("vf-eval TOML files are not prime-rl entrypoint configs")
+
     could_parse = []
     for config_cls in CONFIG_CLASSES:
         try:

diff --git a/uv.lock b/uv.lock
+3 −0		.github/workflows/publish-envs.yaml
+1 −0		environments/ddbc/README.md
+2 −0		environments/ddbc/ddbc/ddbc.py
+45 −6		environments/ddbc/ddbc/open_one.py
+1 −1		environments/ddbc/pyproject.toml
+1 −0		environments/ddbc_rlm/README.md
+2 −0		environments/ddbc_rlm/ddbc_rlm/ddbc_rlm.py
+45 −6		environments/ddbc_rlm/ddbc_rlm/open_one.py
+1 −1		environments/ddbc_rlm/pyproject.toml
+3 −0		environments/deepdive/README.md
+11 −0		environments/deepdive/deepdive/config.py
+4 −0		environments/deepdive/deepdive/deepdive.py
+45 −6		environments/deepdive/deepdive/open_one.py
+1 −1		environments/deepdive/pyproject.toml
+2 −0		environments/deepdive_rlm/README.md
+11 −0		environments/deepdive_rlm/deepdive_rlm/config.py
+4 −0		environments/deepdive_rlm/deepdive_rlm/deepdive_rlm.py
+45 −8		environments/deepdive_rlm/deepdive_rlm/open_one.py
+1 −1		environments/deepdive_rlm/pyproject.toml
+25 −0		environments/general_agent/README.md
+495 −0		environments/general_agent/general_agent/solver/rlm/behavior.py
+63 −0		environments/general_agent/general_agent/solver/rlm/env.py
+150 −0		environments/general_agent/general_agent/solver/rlm/prompts/behavior.md
+1 −1		environments/general_agent/general_agent/solver/rubric.py
+1 −1		environments/general_agent/pyproject.toml
+3 −0		environments/opencode_deepdive/README.md
+11 −1		environments/opencode_deepdive/opencode_deepdive/opencode_deepdive.py
+1 −1		environments/opencode_deepdive/pyproject.toml
+3 −0		environments/rlm_deepdive/README.md
+1 −1		environments/rlm_deepdive/pyproject.toml
+11 −0		environments/rlm_deepdive/rlm_deepdive/rlm_deepdive.py
+64 −0		environments/rlm_swe/README.md
+1 −1		environments/rlm_swe/pyproject.toml
+665 −0		environments/rlm_swe/rlm_swe/behavior.py
+220 −0		environments/rlm_swe/rlm_swe/prompts/behavior.md
+1 −0		environments/rlm_swe/rlm_swe/prompts/venv_hint.md
+75 −0		environments/rlm_swe/rlm_swe/rlm_swe.py
+2 −1		skills/env-sync-push/SKILL.md