Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion configs/01/first-eval-suite.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model = "openai/gpt-5.4-nano"
model = "openai/gpt-oss-20b"
save_results = true

[[eval]]
Expand Down
2 changes: 1 addition & 1 deletion configs/01/first-eval.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model = "openai/gpt-5.4-nano"
model = "openai/gpt-oss-20b"
save_results = true

[[eval]]
Expand Down
2 changes: 1 addition & 1 deletion configs/02/reverse-text-eval.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model = "openai/gpt-5.4-nano"
model = "openai/gpt-oss-20b"
save_results = true

[[eval]]
Expand Down
2 changes: 1 addition & 1 deletion configs/07/advanced-if-eval.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model = "openai/gpt-4.1-mini"
model = "openai/gpt-oss-120b"
save_results = true

[[eval]]
Expand Down
2 changes: 1 addition & 1 deletion configs/07/ifeval-eval.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model = "openai/gpt-4.1-mini"
model = "openai/gpt-oss-120b"
save_results = true

[[eval]]
Expand Down
2 changes: 1 addition & 1 deletion configs/07/simple-judge-eval.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model = "openai/gpt-4.1-mini"
model = "openai/gpt-oss-120b"
save_results = true

[[eval]]
Expand Down
2 changes: 1 addition & 1 deletion configs/07/wiki-search-eval.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model = "openai/gpt-5.4-nano"
model = "openai/gpt-oss-20b"
save_results = true

[[eval]]
Expand Down
2 changes: 1 addition & 1 deletion configs/09/math-python-eval.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model = "openai/gpt-5.4-nano"
model = "openai/gpt-oss-20b"
save_results = true

[[eval]]
Expand Down
2 changes: 1 addition & 1 deletion configs/09/opencode-harbor.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model = "openai/gpt-5.4-mini"
model = "openai/gpt-5.5"
save_results = true

[[eval]]
Expand Down
2 changes: 1 addition & 1 deletion configs/11/calendar-scheduling-eval.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model = "openai/gpt-5.4-nano"
model = "openai/gpt-oss-20b"
save_results = true

[[eval]]
Expand Down
2 changes: 1 addition & 1 deletion configs/12/deep-agents-eval.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model = "openai/gpt-5.4-nano"
model = "openai/gpt-oss-20b"
save_results = true

[[eval]]
Expand Down
2 changes: 1 addition & 1 deletion configs/12/dspy-rlm-eval.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
model = "openai/gpt-5.4-nano"
model = "openai/gpt-oss-20b"
save_results = true

[[eval]]
Expand Down
18 changes: 9 additions & 9 deletions configs/endpoints.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,21 +56,21 @@ type = "openai_chat_completions"

[[endpoint]]
endpoint_id = "gpt-5.4-nano"
model = "openai/gpt-5.4-nano"
model = "openai/gpt-oss-20b"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"
type = "openai_chat_completions"

[[endpoint]]
endpoint_id = "gpt-5.4-mini"
model = "openai/gpt-5.4-mini"
model = "openai/gpt-5.5"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"
type = "openai_chat_completions"

[[endpoint]]
endpoint_id = "gpt-5.4"
model = "openai/gpt-5.4"
model = "openai/gpt-5.5"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"
type = "openai_chat_completions"
Expand All @@ -84,35 +84,35 @@ type = "openai_chat_completions"

[[endpoint]]
endpoint_id = "gpt-4.1-nano"
model = "openai/gpt-4.1-nano"
model = "openai/gpt-oss-20b"
url = "https://api.pinference.ai/api/v1"
key = "OPENAI_API_KEY"
type = "openai_chat_completions"

[[endpoint]]
endpoint_id = "gpt-4.1-mini"
model = "openai/gpt-4.1-mini"
model = "openai/gpt-oss-120b"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"
type = "openai_chat_completions"

[[endpoint]]
endpoint_id = "gpt-4.1"
model = "openai/gpt-4.1"
model = "openai/gpt-5.5"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"
type = "openai_chat_completions"

[[endpoint]]
endpoint_id = "gpt-5.2"
model = "openai/gpt-5.2"
model = "openai/gpt-5.5"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"
type = "openai_chat_completions"

[[endpoint]]
endpoint_id = "gpt-5.4"
model = "openai/gpt-5.4"
model = "openai/gpt-5.5"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"
type = "openai_chat_completions"
Expand All @@ -129,4 +129,4 @@ endpoint_id = "glm-5.1"
model = "z-ai/glm-5.1"
url = "https://api.pinference.ai/api/v1"
key = "PRIME_API_KEY"
type = "openai_chat_completions"
type = "openai_chat_completions"
6 changes: 3 additions & 3 deletions environments/AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ Judges are used for tasks where deterministic evaluation is impractical, and an

```python
judge_rubric = vf.JudgeRubric(
judge_model="gpt-4.1-mini",
judge_model="openai/gpt-4.1-mini",
)

async def judge_correctness(prompt, completion, answer, judge) -> float:
Expand All @@ -322,7 +322,7 @@ For more control, JudgeRubric accepts a custom `judge_prompt` template and expos

```python
judge_rubric = vf.JudgeRubric(
judge_model="gpt-4.1-mini",
judge_model="openai/gpt-4.1-mini",
judge_prompt="""Rate the writing quality of this response from 0-10.
Response: {response}
Score:"""
Expand Down Expand Up @@ -354,7 +354,7 @@ MathRubric includes a `correct_answer` reward function that parses `\boxed{}` an

```python
math_rubric = vf.MathRubric()
judge_rubric = vf.JudgeRubric(judge_model="gpt-4.1-mini")
judge_rubric = vf.JudgeRubric(judge_model="openai/gpt-4.1-mini")
judge_rubric.add_reward_func(judge_correctness, weight=0.5)

rubric = vf.RubricGroup([math_rubric, judge_rubric])
Expand Down
2 changes: 1 addition & 1 deletion environments/gsm8k/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Configure model and sampling:

```bash
prime eval run gsm8k \
-m gpt-4.1-mini \
-m openai/gpt-oss-120b \
-n 20 -r 3 -t 1024 -T 0.7
```

Expand Down
2 changes: 1 addition & 1 deletion environments/opencode_harbor/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ prime eval run opencode-harbor
Configure model and sampling:

```bash
prime eval run opencode-harbor -m openai/gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7
prime eval run opencode-harbor -m openai/gpt-oss-120b -n 20 -r 3 -t 1024 -T 0.7
```

Notes:
Expand Down
6 changes: 3 additions & 3 deletions environments/patent_search/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ Each package uses the v1 `Taskset` plus default `Harness` pattern: the taskset o
Evaluate published environments:

```bash
prime eval run primeintellect/basic-patent-q-and-a --model openai/gpt-4.1-mini
prime eval run primeintellect/advanced-patent-q-and-a --model openai/gpt-4.1-mini
prime eval run primeintellect/patent-technical-analysis --model openai/gpt-4.1-mini
prime eval run primeintellect/basic-patent-q-and-a --model openai/gpt-oss-120b
prime eval run primeintellect/advanced-patent-q-and-a --model openai/gpt-oss-120b
prime eval run primeintellect/patent-technical-analysis --model openai/gpt-oss-120b
```

Train from the included configs:
Expand Down
2 changes: 1 addition & 1 deletion environments/reverse_text/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Configure model and sampling:

```bash
prime eval run reverse-text \
-m openai/gpt-4.1-mini \
-m openai/gpt-oss-120b \
-n 20 -r 3 -t 1024 -T 0.7
```

Expand Down
2 changes: 1 addition & 1 deletion environments/simple_judge/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ Toy single-turn environment for learning LLM judge wiring. Each task asks for a

```bash
prime env install simple-judge
prime eval run simple-judge -m openai/gpt-4.1-mini -n 6 -r 2
prime eval run simple-judge -m openai/gpt-oss-120b -n 6 -r 2
```
6 changes: 3 additions & 3 deletions environments/wiki_search/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ The index is built lazily — the corpus + collection load runs the first time a

## Reward design

A single judge reward (weight `1.0`): a `gpt-4.1-mini` yes/no on whether the final response is correct and coherent given the ground-truth answer. Incoherent responses score 0 even if the answer is buried inside them.
A single judge reward (weight `1.0`): an `openai/gpt-4.1-mini` yes/no on whether the final response is correct and coherent given the ground-truth answer. Incoherent responses score 0 even if the answer is buried inside them.

The judge call lives in a `@vf.update` handler (`score_with_judge`) that receives the `AsyncOpenAI` client and model name through the same `Toolset.bindings` mechanism the tools use. The reward function (`judge_reward`) just reads `state["judge_score"]` — no factory, no closure-captured client.

Expand All @@ -54,7 +54,7 @@ Configure model and sampling:

```bash
prime eval run wiki-search \
-m openai/gpt-4.1-mini \
-m openai/gpt-oss-120b \
-n 20 -r 3
```

Expand All @@ -70,7 +70,7 @@ All fields live on `WikiSearchTasksetConfig` and can be overridden through the v
| `dataset_split` | str | `"train"` | Split used as the prompt source |
| `max_examples` | int? | `None` | Optional cap on tasks yielded |
| `max_turns` | int | `10` | Per-rollout turn cap |
| `judge_model` | str | `"gpt-4.1-mini"` | Judge model id |
| `judge_model` | str | `"openai/gpt-4.1-mini"` | Judge model id |
| `judge_base_url` | str | OpenAI v1 | Judge endpoint base URL |
| `judge_api_key_var` | str | `"OPENAI_API_KEY"` | Env var holding the judge API key |
| `embed_model` | str | `"text-embedding-3-small"` | Title-embedding model |
Expand Down
2 changes: 1 addition & 1 deletion environments/wordle/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Configure model and sampling:

```bash
prime eval run wordle \
-m openai/gpt-4.1-mini \
-m openai/gpt-oss-120b \
-n 20 -r 3 -t 1024 -T 0.7 \
-a '{"taskset": {"num_train_examples": 100, "num_eval_examples": 20}, "harness": {"max_turns": 6}}'
```
Expand Down
8 changes: 4 additions & 4 deletions guides/01-environments-and-evals/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Run a small eval:

```bash
prime eval run prime/gsm8k \
-m openai/gpt-5.4-nano \
-m openai/gpt-oss-20b \
-n 10 \
-r 2
```
Expand All @@ -27,7 +27,7 @@ This can also be done with a config file:

```toml
# [configs/01/first-eval.toml](../../configs/01/first-eval.toml)
model = "openai/gpt-5.4-nano"
model = "openai/gpt-oss-20b"
save_results = true

[[eval]]
Expand Down Expand Up @@ -72,7 +72,7 @@ There are several factors to consider when selecting a model:

The same environments you use for evaluating closed frontier models can be used for training your own models on top of an open base model. See [Training with RL](../03-training-with-rl/README.md#choose-a-training-model) for how to connect a training-compatible model to the same environments.

**Cost, speed, and capability.** Start with a cheap, fast model — `openai/gpt-5.4-nano`, `anthropic/claude-haiku-4.5`, or a small open model like `Qwen/Qwen3.5-0.8B` — to confirm the environment and scoring work, then step up when you're iterating on prompts or checking the ceiling. Many evals use OpenAI or Anthropic models: pass a Prime Inference id to `-m` as above, or an alias from [configs/endpoints.toml](../../configs/endpoints.toml) with your own API key. Run `prime inference models` if you want to browse options or compare pricing. If a bigger model doesn't move scores, the bottleneck is probably the environment, not the model.
**Cost, speed, and capability.** Start with a cheap, fast model — `openai/gpt-oss-20b`, `anthropic/claude-haiku-4.5`, or a small open model like `Qwen/Qwen3.5-0.8B` — to confirm the environment and scoring work, then step up when you're iterating on prompts or checking the ceiling. Many evals use OpenAI or Anthropic models: pass a Prime Inference id to `-m` as above, or an alias from [configs/endpoints.toml](../../configs/endpoints.toml) with your own API key. Run `prime inference models` if you want to browse options or compare pricing. If a bigger model doesn't move scores, the bottleneck is probably the environment, not the model.

**Reasoning controls.** Many model families, including `Qwen3.5` / `Qwen3.6`, `Nemotron`, and `gpt-oss`, support thinking mode — extended chain-of-thought before the final answer, toggled via `[sampling].enable_thinking` (or `reasoning_effort` for `gpt-oss`). This helps on multi-step tasks (math, code, logic) but inflates output length and cost. When comparing models, try a few reasoning settings so you see the cost-performance tradeoffs, not just the best-case score.

Expand All @@ -87,7 +87,7 @@ Use [configs/01/first-eval-suite.toml](../../configs/01/first-eval-suite.toml):

```toml
# [configs/01/first-eval-suite.toml](../../configs/01/first-eval-suite.toml)
model = "openai/gpt-5.4-nano"
model = "openai/gpt-oss-20b"
save_results = true

[[eval]]
Expand Down
6 changes: 3 additions & 3 deletions guides/02-building-your-first-environment/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ Run a small eval:

```bash
prime eval run reverse-text \
-m openai/gpt-5.4-nano \
-m openai/gpt-oss-20b \
-n 10 \
-r 2 \
-t 512
Expand All @@ -174,7 +174,7 @@ Or run with a config file:

```toml
# [configs/02/reverse-text-eval.toml](../../configs/02/reverse-text-eval.toml)
model = "openai/gpt-5.4-nano"
model = "openai/gpt-oss-20b"
save_results = true

[[eval]]
Expand All @@ -193,7 +193,7 @@ belong under `taskset`; base harness fields belong under `harness`:

```bash
prime eval run reverse-text \
-m openai/gpt-5.4-nano \
-m openai/gpt-oss-20b \
-a '{"taskset": {"dataset_split": "train[:100]"}, "harness": {"max_turns": 1}}'
```

Expand Down
18 changes: 9 additions & 9 deletions guides/07-judges-and-instruction-following/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ Each task stores a criterion in `info`. `judge_reward` calls an LLM and parses `

```bash
prime env install simple-judge
prime eval run simple-judge -m openai/gpt-4.1-mini -n 6 -r 2
prime eval run simple-judge -m openai/gpt-oss-120b -n 6 -r 2
```

```toml
# [configs/07/simple-judge-eval.toml](../../configs/07/simple-judge-eval.toml)
model = "openai/gpt-4.1-mini"
model = "openai/gpt-oss-120b"
save_results = true

[[eval]]
Expand All @@ -41,14 +41,14 @@ configs:

```bash
prime eval run simple-judge \
-m openai/gpt-4.1-mini \
-a '{"taskset": {"judge_model": "openai/gpt-5-mini"}}'
-m openai/gpt-oss-120b \
-a '{"taskset": {"judge_model": "openai/gpt-4.1-mini"}}'
```

```toml
[[eval]]
env_id = "simple-judge"
taskset = { judge_model = "openai/gpt-5-mini", judge_api_key_var = "PRIME_API_KEY" }
taskset = { judge_model = "openai/gpt-4.1-mini", judge_api_key_var = "PRIME_API_KEY" }
```

Call `vf.ensure_keys(...)` in `load_taskset` if the env requires API keys.
Expand All @@ -60,12 +60,12 @@ Implementation: [environments/simple_judge/simple_judge.py](../../environments/s
## Part 2: IFEval

```bash
prime eval run prime/ifeval -m openai/gpt-4.1-mini -n 10 -r 1 -t 1024
prime eval run prime/ifeval -m openai/gpt-oss-120b -n 10 -r 1 -t 1024
```

```toml
# [configs/07/ifeval-eval.toml](../../configs/07/ifeval-eval.toml)
model = "openai/gpt-4.1-mini"
model = "openai/gpt-oss-120b"
save_results = true

[[eval]]
Expand All @@ -81,12 +81,12 @@ taskset = { mode = "strict" }
## Part 3: AdvancedIF

```bash
prime eval run will/advanced-if -m openai/gpt-4.1-mini -n 5 -r 1 -t 2048
prime eval run will/advanced-if -m openai/gpt-oss-120b -n 5 -r 1 -t 2048
```

```toml
# [configs/07/advanced-if-eval.toml](../../configs/07/advanced-if-eval.toml)
model = "openai/gpt-4.1-mini"
model = "openai/gpt-oss-120b"
save_results = true

[[eval]]
Expand Down
Loading