PrimeIntellect-ai · xeophon · Jun 1, 2026 · Jun 1, 2026
diff --git a/configs/01/first-eval-suite.toml b/configs/01/first-eval-suite.toml
@@ -1,4 +1,4 @@
-model = "openai/gpt-5.4-nano"
+model = "openai/gpt-oss-20b"
 save_results = true
 
 [[eval]]

diff --git a/configs/01/first-eval.toml b/configs/01/first-eval.toml
@@ -1,4 +1,4 @@
-model = "openai/gpt-5.4-nano"
+model = "openai/gpt-oss-20b"
 save_results = true
 
 [[eval]]

diff --git a/configs/02/reverse-text-eval.toml b/configs/02/reverse-text-eval.toml
@@ -1,4 +1,4 @@
-model = "openai/gpt-5.4-nano"
+model = "openai/gpt-oss-20b"
 save_results = true
 
 [[eval]]

diff --git a/configs/07/advanced-if-eval.toml b/configs/07/advanced-if-eval.toml
@@ -1,4 +1,4 @@
-model = "openai/gpt-4.1-mini"
+model = "openai/gpt-oss-120b"
 save_results = true
 
 [[eval]]

diff --git a/configs/07/ifeval-eval.toml b/configs/07/ifeval-eval.toml
@@ -1,4 +1,4 @@
-model = "openai/gpt-4.1-mini"
+model = "openai/gpt-oss-120b"
 save_results = true
 
 [[eval]]

diff --git a/configs/07/simple-judge-eval.toml b/configs/07/simple-judge-eval.toml
@@ -1,4 +1,4 @@
-model = "openai/gpt-4.1-mini"
+model = "openai/gpt-oss-120b"
 save_results = true
 
 [[eval]]

diff --git a/configs/07/wiki-search-eval.toml b/configs/07/wiki-search-eval.toml
@@ -1,4 +1,4 @@
-model = "openai/gpt-5.4-nano"
+model = "openai/gpt-oss-20b"
 save_results = true
 
 [[eval]]

diff --git a/configs/09/math-python-eval.toml b/configs/09/math-python-eval.toml
@@ -1,4 +1,4 @@
-model = "openai/gpt-5.4-nano"
+model = "openai/gpt-oss-20b"
 save_results = true
 
 [[eval]]

diff --git a/configs/09/opencode-harbor.toml b/configs/09/opencode-harbor.toml
@@ -1,4 +1,4 @@
-model = "openai/gpt-5.4-mini"
+model = "openai/gpt-5.5"
 save_results = true
 
 [[eval]]

diff --git a/configs/11/calendar-scheduling-eval.toml b/configs/11/calendar-scheduling-eval.toml
@@ -1,4 +1,4 @@
-model = "openai/gpt-5.4-nano"
+model = "openai/gpt-oss-20b"
 save_results = true
 
 [[eval]]

diff --git a/configs/12/deep-agents-eval.toml b/configs/12/deep-agents-eval.toml
@@ -1,4 +1,4 @@
-model = "openai/gpt-5.4-nano"
+model = "openai/gpt-oss-20b"
 save_results = true
 
 [[eval]]

diff --git a/configs/12/dspy-rlm-eval.toml b/configs/12/dspy-rlm-eval.toml
@@ -1,4 +1,4 @@
-model = "openai/gpt-5.4-nano"
+model = "openai/gpt-oss-20b"
 save_results = true
 
 [[eval]]

diff --git a/configs/endpoints.toml b/configs/endpoints.toml
@@ -56,21 +56,21 @@ type = "openai_chat_completions"
 
 [[endpoint]]
 endpoint_id = "gpt-5.4-nano"
-model = "openai/gpt-5.4-nano"
+model = "openai/gpt-oss-20b"
 url = "https://api.pinference.ai/api/v1"
 key = "PRIME_API_KEY"
 type = "openai_chat_completions"
 
 [[endpoint]]
 endpoint_id = "gpt-5.4-mini"
-model = "openai/gpt-5.4-mini"
+model = "openai/gpt-5.5"
 url = "https://api.pinference.ai/api/v1"
 key = "PRIME_API_KEY"
 type = "openai_chat_completions"
 
 [[endpoint]]
 endpoint_id = "gpt-5.4"
-model = "openai/gpt-5.4"
+model = "openai/gpt-5.5"
 url = "https://api.pinference.ai/api/v1"
 key = "PRIME_API_KEY"
 type = "openai_chat_completions"
@@ -84,35 +84,35 @@ type = "openai_chat_completions"
 
 [[endpoint]]
 endpoint_id = "gpt-4.1-nano"
-model = "openai/gpt-4.1-nano"
+model = "openai/gpt-oss-20b"
 url = "https://api.pinference.ai/api/v1"
 key = "OPENAI_API_KEY"
 type = "openai_chat_completions"
 
 [[endpoint]]
 endpoint_id = "gpt-4.1-mini"
-model = "openai/gpt-4.1-mini"
+model = "openai/gpt-oss-120b"
 url = "https://api.pinference.ai/api/v1"
 key = "PRIME_API_KEY"
 type = "openai_chat_completions"
 
 [[endpoint]]
 endpoint_id = "gpt-4.1"
-model = "openai/gpt-4.1"
+model = "openai/gpt-5.5"
 url = "https://api.pinference.ai/api/v1"
 key = "PRIME_API_KEY"
 type = "openai_chat_completions"
 
 [[endpoint]]
 endpoint_id = "gpt-5.2"
-model = "openai/gpt-5.2"
+model = "openai/gpt-5.5"
 url = "https://api.pinference.ai/api/v1"
 key = "PRIME_API_KEY"
 type = "openai_chat_completions"
 
 [[endpoint]]
 endpoint_id = "gpt-5.4"
-model = "openai/gpt-5.4"
+model = "openai/gpt-5.5"
 url = "https://api.pinference.ai/api/v1"
 key = "PRIME_API_KEY"
 type = "openai_chat_completions"
@@ -129,4 +129,4 @@ endpoint_id = "glm-5.1"
 model = "z-ai/glm-5.1"
 url = "https://api.pinference.ai/api/v1"
 key = "PRIME_API_KEY"
-type = "openai_chat_completions"
+type = "openai_chat_completions"
diff --git a/environments/AGENTS.md b/environments/AGENTS.md
@@ -306,7 +306,7 @@ Judges are used for tasks where deterministic evaluation is impractical, and an
 
 ```python
 judge_rubric = vf.JudgeRubric(
-    judge_model="gpt-4.1-mini",
+    judge_model="openai/gpt-4.1-mini",
 )
 
 async def judge_correctness(prompt, completion, answer, judge) -> float:
@@ -322,7 +322,7 @@ For more control, JudgeRubric accepts a custom `judge_prompt` template and expos
 
 ```python
 judge_rubric = vf.JudgeRubric(
-    judge_model="gpt-4.1-mini",
+    judge_model="openai/gpt-4.1-mini",
     judge_prompt="""Rate the writing quality of this response from 0-10.
 Response: {response}
 Score:"""
@@ -354,7 +354,7 @@ MathRubric includes a `correct_answer` reward function that parses `\boxed{}` an
 
 ```python
 math_rubric = vf.MathRubric()
-judge_rubric = vf.JudgeRubric(judge_model="gpt-4.1-mini")
+judge_rubric = vf.JudgeRubric(judge_model="openai/gpt-4.1-mini")
 judge_rubric.add_reward_func(judge_correctness, weight=0.5)
 
 rubric = vf.RubricGroup([math_rubric, judge_rubric])

diff --git a/environments/gsm8k/README.md b/environments/gsm8k/README.md
@@ -29,7 +29,7 @@ Configure model and sampling:
 
 ```bash
 prime eval run gsm8k \
-  -m gpt-4.1-mini \
+  -m openai/gpt-oss-120b \
   -n 20 -r 3 -t 1024 -T 0.7
 ```
 

diff --git a/environments/opencode_harbor/README.md b/environments/opencode_harbor/README.md
@@ -24,7 +24,7 @@ prime eval run opencode-harbor
 Configure model and sampling:
 
 ```bash
-prime eval run opencode-harbor -m openai/gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7
+prime eval run opencode-harbor -m openai/gpt-oss-120b -n 20 -r 3 -t 1024 -T 0.7
 ```
 
 Notes:

diff --git a/environments/patent_search/README.md b/environments/patent_search/README.md
@@ -27,9 +27,9 @@ Each package uses the v1 `Taskset` plus default `Harness` pattern: the taskset o
 Evaluate published environments:
 
 ```bash
-prime eval run primeintellect/basic-patent-q-and-a --model openai/gpt-4.1-mini
-prime eval run primeintellect/advanced-patent-q-and-a --model openai/gpt-4.1-mini
-prime eval run primeintellect/patent-technical-analysis --model openai/gpt-4.1-mini
+prime eval run primeintellect/basic-patent-q-and-a --model openai/gpt-oss-120b
+prime eval run primeintellect/advanced-patent-q-and-a --model openai/gpt-oss-120b
+prime eval run primeintellect/patent-technical-analysis --model openai/gpt-oss-120b
 ```
 
 Train from the included configs:

diff --git a/environments/reverse_text/README.md b/environments/reverse_text/README.md
@@ -28,7 +28,7 @@ Configure model and sampling:
 
 ```bash
 prime eval run reverse-text \
-  -m openai/gpt-4.1-mini \
+  -m openai/gpt-oss-120b \
   -n 20 -r 3 -t 1024 -T 0.7
 ```
 

diff --git a/environments/simple_judge/README.md b/environments/simple_judge/README.md
@@ -10,5 +10,5 @@ Toy single-turn environment for learning LLM judge wiring. Each task asks for a
 
 ```bash
 prime env install simple-judge
-prime eval run simple-judge -m openai/gpt-4.1-mini -n 6 -r 2
+prime eval run simple-judge -m openai/gpt-oss-120b -n 6 -r 2
 ```
diff --git a/environments/wiki_search/README.md b/environments/wiki_search/README.md
@@ -32,7 +32,7 @@ The index is built lazily — the corpus + collection load runs the first time a
 
 ## Reward design
 
-A single judge reward (weight `1.0`): a `gpt-4.1-mini` yes/no on whether the final response is correct and coherent given the ground-truth answer. Incoherent responses score 0 even if the answer is buried inside them.
+A single judge reward (weight `1.0`): an `openai/gpt-4.1-mini` yes/no on whether the final response is correct and coherent given the ground-truth answer. Incoherent responses score 0 even if the answer is buried inside them.
 
 The judge call lives in a `@vf.update` handler (`score_with_judge`) that receives the `AsyncOpenAI` client and model name through the same `Toolset.bindings` mechanism the tools use. The reward function (`judge_reward`) just reads `state["judge_score"]` — no factory, no closure-captured client.
 
@@ -54,7 +54,7 @@ Configure model and sampling:
 
 ```bash
 prime eval run wiki-search \
-  -m openai/gpt-4.1-mini \
+  -m openai/gpt-oss-120b \
   -n 20 -r 3
 ```
 
@@ -70,7 +70,7 @@ All fields live on `WikiSearchTasksetConfig` and can be overridden through the v
 | `dataset_split` | str | `"train"` | Split used as the prompt source |
 | `max_examples` | int? | `None` | Optional cap on tasks yielded |
 | `max_turns` | int | `10` | Per-rollout turn cap |
-| `judge_model` | str | `"gpt-4.1-mini"` | Judge model id |
+| `judge_model` | str | `"openai/gpt-4.1-mini"` | Judge model id |
 | `judge_base_url` | str | OpenAI v1 | Judge endpoint base URL |
 | `judge_api_key_var` | str | `"OPENAI_API_KEY"` | Env var holding the judge API key |
 | `embed_model` | str | `"text-embedding-3-small"` | Title-embedding model |

diff --git a/environments/wordle/README.md b/environments/wordle/README.md
@@ -28,7 +28,7 @@ Configure model and sampling:
 
 ```bash
 prime eval run wordle \
-  -m openai/gpt-4.1-mini \
+  -m openai/gpt-oss-120b \
   -n 20 -r 3 -t 1024 -T 0.7 \
   -a '{"taskset": {"num_train_examples": 100, "num_eval_examples": 20}, "harness": {"max_turns": 6}}'
 ```

diff --git a/guides/01-environments-and-evals/README.md b/guides/01-environments-and-evals/README.md
@@ -16,7 +16,7 @@ Run a small eval:
 
 ```bash
 prime eval run prime/gsm8k \
-  -m openai/gpt-5.4-nano \
+  -m openai/gpt-oss-20b \
   -n 10 \
   -r 2
 ```
@@ -27,7 +27,7 @@ This can also be done with a config file:
 
 ```toml
 # [configs/01/first-eval.toml](../../configs/01/first-eval.toml)
-model = "openai/gpt-5.4-nano"
+model = "openai/gpt-oss-20b"
 save_results = true
 
 [[eval]]
@@ -72,7 +72,7 @@ There are several factors to consider when selecting a model:
 
 The same environments you use for evaluating closed frontier models can be used for training your own models on top of an open base model. See [Training with RL](../03-training-with-rl/README.md#choose-a-training-model) for how to connect a training-compatible model to the same environments.
 
-**Cost, speed, and capability.** Start with a cheap, fast model — `openai/gpt-5.4-nano`, `anthropic/claude-haiku-4.5`, or a small open model like `Qwen/Qwen3.5-0.8B` — to confirm the environment and scoring work, then step up when you're iterating on prompts or checking the ceiling. Many evals use OpenAI or Anthropic models: pass a Prime Inference id to `-m` as above, or an alias from [configs/endpoints.toml](../../configs/endpoints.toml) with your own API key. Run `prime inference models` if you want to browse options or compare pricing. If a bigger model doesn't move scores, the bottleneck is probably the environment, not the model.
+**Cost, speed, and capability.** Start with a cheap, fast model — `openai/gpt-oss-20b`, `anthropic/claude-haiku-4.5`, or an even smaller open model like `Qwen/Qwen3.5-0.8B` — to confirm the environment and scoring work, then step up when you're iterating on prompts or checking the ceiling. Many evals use OpenAI or Anthropic models: pass a Prime Inference id to `-m` as above, or an alias from [configs/endpoints.toml](../../configs/endpoints.toml) with your own API key. Run `prime inference models` if you want to browse options or compare pricing. If a bigger model doesn't move scores, the bottleneck is probably the environment, not the model.
 
 **Reasoning controls.** Many model families, including `Qwen3.5` / `Qwen3.6`, `Nemotron`, and `gpt-oss`, support thinking mode — extended chain-of-thought before the final answer, toggled via `[sampling].enable_thinking` (or `reasoning_effort` for `gpt-oss`). This helps on multi-step tasks (math, code, logic) but inflates output length and cost. When comparing models, try a few reasoning settings so you see the cost-performance tradeoffs, not just the best-case score.
 
@@ -87,7 +87,7 @@ Use [configs/01/first-eval-suite.toml](../../configs/01/first-eval-suite.toml):
 
 ```toml
 # [configs/01/first-eval-suite.toml](../../configs/01/first-eval-suite.toml)
-model = "openai/gpt-5.4-nano"
+model = "openai/gpt-oss-20b"
 save_results = true
 
 [[eval]]

diff --git a/guides/02-building-your-first-environment/README.md b/guides/02-building-your-first-environment/README.md
@@ -164,7 +164,7 @@ Run a small eval:
 
 ```bash
 prime eval run reverse-text \
-  -m openai/gpt-5.4-nano \
+  -m openai/gpt-oss-20b \
   -n 10 \
   -r 2 \
   -t 512
@@ -174,7 +174,7 @@ Or run with a config file:
 
 ```toml
 # [configs/02/reverse-text-eval.toml](../../configs/02/reverse-text-eval.toml)
-model = "openai/gpt-5.4-nano"
+model = "openai/gpt-oss-20b"
 save_results = true
 
 [[eval]]
@@ -193,7 +193,7 @@ belong under `taskset`; base harness fields belong under `harness`:
 
 ```bash
 prime eval run reverse-text \
-  -m openai/gpt-5.4-nano \
+  -m openai/gpt-oss-20b \
   -a '{"taskset": {"dataset_split": "train[:100]"}, "harness": {"max_turns": 1}}'
 ```
 

diff --git a/guides/07-judges-and-instruction-following/README.md b/guides/07-judges-and-instruction-following/README.md
@@ -12,12 +12,12 @@ Each task stores a criterion in `info`. `judge_reward` calls an LLM and parses `
 
 ```bash
 prime env install simple-judge
-prime eval run simple-judge -m openai/gpt-4.1-mini -n 6 -r 2
+prime eval run simple-judge -m openai/gpt-oss-120b -n 6 -r 2
 ```
 
 ```toml
 # [configs/07/simple-judge-eval.toml](../../configs/07/simple-judge-eval.toml)
-model = "openai/gpt-4.1-mini"
+model = "openai/gpt-oss-120b"
 save_results = true
 
 [[eval]]
@@ -41,14 +41,14 @@ configs:
 
 ```bash
 prime eval run simple-judge \
-  -m openai/gpt-4.1-mini \
-  -a '{"taskset": {"judge_model": "openai/gpt-5-mini"}}'
+  -m openai/gpt-oss-120b \
+  -a '{"taskset": {"judge_model": "openai/gpt-4.1-mini"}}'
 ```
 
 ```toml
 [[eval]]
 env_id = "simple-judge"
-taskset = { judge_model = "openai/gpt-5-mini", judge_api_key_var = "PRIME_API_KEY" }
+taskset = { judge_model = "openai/gpt-4.1-mini", judge_api_key_var = "PRIME_API_KEY" }
 ```
 
 Call `vf.ensure_keys(...)` in `load_taskset` if the env requires API keys.
@@ -60,12 +60,12 @@ Implementation: [environments/simple_judge/simple_judge.py](../../environments/s
 ## Part 2: IFEval
 
 ```bash
-prime eval run prime/ifeval -m openai/gpt-4.1-mini -n 10 -r 1 -t 1024
+prime eval run prime/ifeval -m openai/gpt-oss-120b -n 10 -r 1 -t 1024
 ```
 
 ```toml
 # [configs/07/ifeval-eval.toml](../../configs/07/ifeval-eval.toml)
-model = "openai/gpt-4.1-mini"
+model = "openai/gpt-oss-120b"
 save_results = true
 
 [[eval]]
@@ -81,12 +81,12 @@ taskset = { mode = "strict" }
 ## Part 3: AdvancedIF
 
 ```bash
-prime eval run will/advanced-if -m openai/gpt-4.1-mini -n 5 -r 1 -t 2048
+prime eval run will/advanced-if -m openai/gpt-oss-120b -n 5 -r 1 -t 2048
 ```
 
 ```toml
 # [configs/07/advanced-if-eval.toml](../../configs/07/advanced-if-eval.toml)
-model = "openai/gpt-4.1-mini"
+model = "openai/gpt-oss-120b"
 save_results = true
 
 [[eval]]
-Original file line number
+Diff line change
@@ Expand Up / @@ -24,7 +24,7 @@ prime eval run opencode-harbor @@
     Configure model and sampling:
     ```bash
-    prime eval run opencode-harbor -m openai/gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7
+    prime eval run opencode-harbor -m openai/gpt-oss-120b -n 20 -r 3 -t 1024 -T 0.7
     ```
     Notes:
@@ Expand Down @@