Commit a4cae42

Remove unsupported models from docs (#34)
* Remove nano from docs, examples, benchmarking
* Updating graphs

1 parent 6888a6b commit a4cae42

14 files changed (+13, -37 lines)
Updated graph images (binary size changes: -9.25 KB, -46.4 KB, -89.2 KB, -80.3 KB); previews not shown.

docs/evals.md

Lines changed: 2 additions & 2 deletions

````diff
@@ -11,7 +11,7 @@ npm run eval -- --config-path guardrails_config.json --dataset-path data.jsonl
 
 ### Benchmark Mode
 ```bash
-npm run eval -- --config-path guardrails_config.json --dataset-path data.jsonl --mode benchmark --models gpt-5 gpt-5-mini gpt-5-nano
+npm run eval -- --config-path guardrails_config.json --dataset-path data.jsonl --mode benchmark --models gpt-5 gpt-5-mini gpt-4.1-mini
 ```
 
 ## Dependencies
@@ -160,4 +160,4 @@ npm run eval -- --config-path config.json --dataset-path data.jsonl --base-url h
 ## Next Steps
 
 - See the [API Reference](./ref/eval/guardrail_evals.md) for detailed documentation
-- Use [Wizard UI](https://guardrails.openai.com/) for configuring guardrails without code
+- Use [Wizard UI](https://guardrails.openai.com/) for configuring guardrails without code
````
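
The updated command above lists the models now supported for benchmark mode. As an illustrative sketch only, here is how that invocation could be driven from a Node/TypeScript script; the flags and model list come from the diff, while the wrapper file itself (`run_benchmark.ts`) and the use of `spawnSync` are assumptions, not part of this repo.

```typescript
// run_benchmark.ts - hypothetical wrapper around the documented CLI command.
import { spawnSync } from 'node:child_process';

// Models listed in the updated benchmark example; the nano variants were removed in this commit.
const models = ['gpt-5', 'gpt-5-mini', 'gpt-4.1-mini'];

const result = spawnSync(
  'npm',
  [
    'run', 'eval', '--',
    '--config-path', 'guardrails_config.json',
    '--dataset-path', 'data.jsonl',
    '--mode', 'benchmark',
    '--models', ...models,
  ],
  { stdio: 'inherit' }, // stream eval output to the current terminal
);

process.exit(result.status ?? 1);
```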

docs/ref/checks/hallucination_detection.md

Lines changed: 2 additions & 16 deletions

````diff
@@ -176,10 +176,8 @@ The statements cover various types of factual claims including:
 |--------------|---------|-------------|-------------|-------------|
 | gpt-5 | 0.854 | 0.732 | 0.686 | 0.670 |
 | gpt-5-mini | 0.934 | 0.813 | 0.813 | 0.770 |
-| gpt-5-nano | 0.566 | 0.540 | 0.540 | 0.533 |
 | gpt-4.1 | 0.870 | 0.785 | 0.785 | 0.785 |
 | gpt-4.1-mini (default) | 0.876 | 0.806 | 0.789 | 0.789 |
-| gpt-4.1-nano | 0.537 | 0.526 | 0.526 | 0.526 |
 
 **Notes:**
 - ROC AUC: Area under the ROC curve (higher is better)
@@ -193,10 +191,8 @@ The following table shows latency measurements for each model using the hallucin
 |--------------|--------------|--------------|
 | gpt-5 | 34,135 | 525,854 |
 | gpt-5-mini | 23,013 | 59,316 |
-| gpt-5-nano | 17,079 | 26,317 |
 | gpt-4.1 | 7,126 | 33,464 |
 | gpt-4.1-mini (default) | 7,069 | 43,174 |
-| gpt-4.1-nano | 4,809 | 6,869 |
 
 - **TTC P50**: Median time to completion (50% of requests complete within this time)
 - **TTC P95**: 95th percentile time to completion (95% of requests complete within this time)
@@ -218,10 +214,8 @@ In addition to the above evaluations which use a 3 MB sized vector store, the ha
 |--------------|---------------------|----------------------|---------------------|---------------------------|
 | gpt-5 | 28,762 / 396,472 | 34,135 / 525,854 | 37,104 / 75,684 | 40,909 / 645,025 |
 | gpt-5-mini | 19,240 / 39,526 | 23,013 / 59,316 | 24,217 / 65,904 | 37,314 / 118,564 |
-| gpt-5-nano | 13,436 / 22,032 | 17,079 / 26,317 | 17,843 / 35,639 | 21,724 / 37,062 |
 | gpt-4.1 | 7,437 / 15,721 | 7,126 / 33,464 | 6,993 / 30,315 | 6,688 / 127,481 |
 | gpt-4.1-mini (default) | 6,661 / 14,827 | 7,069 / 43,174 | 7,032 / 46,354 | 7,374 / 37,769 |
-| gpt-4.1-nano | 4,296 / 6,378 | 4,809 / 6,869 | 4,171 / 6,609 | 4,650 / 6,201 |
 
 - **Vector store size impact varies by model**: GPT-4.1 series shows minimal latency impact across vector store sizes, while GPT-5 series shows significant increases.
 
@@ -241,10 +235,6 @@ In addition to the above evaluations which use a 3 MB sized vector store, the ha
 | | Medium (3 MB) | 0.934 | 0.813 | 0.813 | 0.770 |
 | | Large (11 MB) | 0.919 | 0.817 | 0.817 | 0.817 |
 | | Extra Large (105 MB) | 0.909 | 0.793 | 0.793 | 0.711 |
-| **gpt-5-nano** | Small (1 MB) | 0.590 | 0.547 | 0.545 | 0.536 |
-| | Medium (3 MB) | 0.566 | 0.540 | 0.540 | 0.533 |
-| | Large (11 MB) | 0.564 | 0.534 | 0.532 | 0.507 |
-| | Extra Large (105 MB) | 0.603 | 0.570 | 0.558 | 0.550 |
 | **gpt-4.1** | Small (1 MB) | 0.907 | 0.839 | 0.839 | 0.839 |
 | | Medium (3 MB) | 0.870 | 0.785 | 0.785 | 0.785 |
 | | Large (11 MB) | 0.846 | 0.753 | 0.753 | 0.753 |
@@ -253,15 +243,11 @@ In addition to the above evaluations which use a 3 MB sized vector store, the ha
 | | Medium (3 MB) | 0.876 | 0.806 | 0.789 | 0.789 |
 | | Large (11 MB) | 0.862 | 0.791 | 0.757 | 0.757 |
 | | Extra Large (105 MB) | 0.802 | 0.722 | 0.722 | 0.722 |
-| **gpt-4.1-nano** | Small (1 MB) | 0.605 | 0.528 | 0.528 | 0.528 |
-| | Medium (3 MB) | 0.537 | 0.526 | 0.526 | 0.526 |
-| | Large (11 MB) | 0.618 | 0.531 | 0.531 | 0.531 |
-| | Extra Large (105 MB) | 0.636 | 0.528 | 0.528 | 0.528 |
 
 **Key Insights:**
 
 - **Best Performance**: gpt-5-mini consistently achieves the highest ROC AUC scores across all vector store sizes (0.909-0.939)
-- **Best Latency**: gpt-4.1-nano shows the most consistent and lowest latency across all scales (4,171-4,809ms P50) but shows poor performance
+- **Best Latency**: gpt-4.1-mini shows the most consistent and lowest latency across all scales (6,661-7,374ms P50) while maintaining solid accuracy
 - **Most Stable**: gpt-4.1-mini (default) maintains relatively stable performance across vector store sizes with good accuracy-latency balance
 - **Scale Sensitivity**: gpt-5 shows the most variability in performance across vector store sizes, with performance dropping significantly at larger scales
 - **Performance vs Scale**: Most models show decreasing performance as vector store size increases, with gpt-5-mini being the most resilient
@@ -271,4 +257,4 @@ In addition to the above evaluations which use a 3 MB sized vector store, the ha
 - **Signal-to-noise ratio degradation**: Larger vector stores contain more irrelevant documents that may not be relevant to the specific factual claims being validated
 - **Semantic search limitations**: File search retrieves semantically similar documents, but with a large diverse knowledge source, these may not always be factually relevant
 - **Document quality matters more than quantity**: The relevance and accuracy of documents is more important than the total number of documents
-- **Performance plateaus**: Beyond a certain size (11 MB), the performance impact becomes less severe
+- **Performance plateaus**: Beyond a certain size (11 MB), the performance impact becomes less severe
````

docs/ref/checks/jailbreak.md

Lines changed: 0 additions & 4 deletions

````diff
@@ -95,21 +95,17 @@ This benchmark evaluates model performance on a diverse set of prompts:
 |--------------|---------|-------------|-------------|-------------|-----------------|
 | gpt-5 | 0.979 | 0.973 | 0.970 | 0.970 | 0.733 |
 | gpt-5-mini | 0.954 | 0.990 | 0.900 | 0.900 | 0.768 |
-| gpt-5-nano | 0.962 | 0.973 | 0.967 | 0.965 | 0.048 |
 | gpt-4.1 | 0.990 | 1.000 | 1.000 | 0.984 | 0.946 |
 | gpt-4.1-mini (default) | 0.982 | 0.992 | 0.992 | 0.954 | 0.444 |
-| gpt-4.1-nano | 0.934 | 0.924 | 0.924 | 0.848 | 0.000 |
 
 #### Latency Performance
 
 | Model | TTC P50 (ms) | TTC P95 (ms) |
 |--------------|--------------|--------------|
 | gpt-5 | 4,569 | 7,256 |
 | gpt-5-mini | 5,019 | 9,212 |
-| gpt-5-nano | 4,702 | 6,739 |
 | gpt-4.1 | 841 | 1,861 |
 | gpt-4.1-mini | 749 | 1,291 |
-| gpt-4.1-nano | 683 | 890 |
 
 **Notes:**
 
````
docs/ref/checks/nsfw.md

Lines changed: 0 additions & 2 deletions

````diff
@@ -84,10 +84,8 @@ This benchmark evaluates model performance on a balanced set of social media pos
 |--------------|---------|-------------|-------------|-------------|-----------------|
 | gpt-5 | 0.9532 | 0.9195 | 0.9096 | 0.9068 | 0.0339 |
 | gpt-5-mini | 0.9629 | 0.9321 | 0.9168 | 0.9149 | 0.0998 |
-| gpt-5-nano | 0.9600 | 0.9297 | 0.9216 | 0.9175 | 0.1078 |
 | gpt-4.1 | 0.9603 | 0.9312 | 0.9249 | 0.9192 | 0.0439 |
 | gpt-4.1-mini (default) | 0.9520 | 0.9180 | 0.9130 | 0.9049 | 0.0459 |
-| gpt-4.1-nano | 0.9502 | 0.9262 | 0.9094 | 0.9043 | 0.0379 |
 
 **Notes:**
 
````

docs/ref/checks/prompt_injection_detection.md

Lines changed: 0 additions & 4 deletions

````diff
@@ -115,10 +115,8 @@ This benchmark evaluates model performance on agent conversation traces:
 |---------------|---------|-------------|-------------|-------------|-----------------|
 | gpt-5 | 0.9931 | 0.9992 | 0.9992 | 0.9992 | 0.5845 |
 | gpt-5-mini | 0.9536 | 0.9951 | 0.9951 | 0.9951 | 0.0000 |
-| gpt-5-nano | 0.9283 | 0.9913 | 0.9913 | 0.9717 | 0.0350 |
 | gpt-4.1 | 0.9794 | 0.9973 | 0.9973 | 0.9973 | 0.0000 |
 | gpt-4.1-mini (default) | 0.9865 | 0.9986 | 0.9986 | 0.9986 | 0.0000 |
-| gpt-4.1-nano | 0.9142 | 0.9948 | 0.9948 | 0.9387 | 0.0000 |
 
 **Notes:**
 
@@ -130,12 +128,10 @@ This benchmark evaluates model performance on agent conversation traces:
 
 | Model | TTC P50 (ms) | TTC P95 (ms) |
 |---------------|--------------|--------------|
-| gpt-4.1-nano | 1,159 | 2,534 |
 | gpt-4.1-mini (default) | 1,481 | 2,563 |
 | gpt-4.1 | 1,742 | 2,296 |
 | gpt-5 | 3,994 | 6,654 |
 | gpt-5-mini | 5,895 | 9,031 |
-| gpt-5-nano | 5,911 | 10,134 |
 
 - **TTC P50**: Median time to completion (50% of requests complete within this time)
 - **TTC P95**: 95th percentile time to completion (95% of requests complete within this time)
````
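
For the TTC metrics quoted in these tables, a small TypeScript sketch of how P50/P95 figures are typically derived from raw completion times; the nearest-rank method and the sample values below are illustrative assumptions, not necessarily how the benchmark computes them.

```typescript
// Nearest-rank percentile over a list of time-to-completion samples (milliseconds).
function percentile(samples: number[], p: number): number {
  if (samples.length === 0) throw new Error('no samples');
  const sorted = [...samples].sort((a, b) => a - b);
  const rank = Math.ceil((p / 100) * sorted.length); // smallest value covering p% of samples
  return sorted[Math.min(rank, sorted.length) - 1];
}

// Made-up completion times (ms) for illustration only.
const ttcSamples = [740, 812, 901, 955, 1_020, 1_180, 1_305, 1_490, 2_250, 2_610];
console.log('TTC P50 (ms):', percentile(ttcSamples, 50)); // half of requests complete within this time
console.log('TTC P95 (ms):', percentile(ttcSamples, 95)); // 95% of requests complete within this time
```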

examples/basic/agents_sdk.ts

Lines changed: 1 addition & 1 deletion

````diff
@@ -34,7 +34,7 @@ const PIPELINE_CONFIG = {
     {
       name: 'Custom Prompt Check',
       config: {
-        model: 'gpt-4.1-nano-2025-04-14',
+        model: 'gpt-4.1-mini-2025-04-14',
         confidence_threshold: 0.7,
         system_prompt_details: 'Check if the text contains any math problems.',
       },
````
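
As a minimal sketch of the updated check entry in isolation: the field values match the hunk above, while the type names and the standalone constant are assumptions for illustration, not code from `examples/basic/agents_sdk.ts`.

```typescript
// Hypothetical types inferred from the fields visible in this diff.
interface CheckConfig {
  model: string;
  confidence_threshold: number;
  system_prompt_details: string;
}

interface CheckEntry {
  name: string;
  config: CheckConfig;
}

const customPromptCheck: CheckEntry = {
  name: 'Custom Prompt Check',
  config: {
    model: 'gpt-4.1-mini-2025-04-14', // was 'gpt-4.1-nano-2025-04-14' before this commit
    confidence_threshold: 0.7,
    system_prompt_details: 'Check if the text contains any math problems.',
  },
};
```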
