Skip to content

Commit 71ea1c4

Browse files
committed
Added MCP client to the pipeline
1 parent b1c62f5 commit 71ea1c4

File tree

7 files changed

+632
-318
lines changed

7 files changed

+632
-318
lines changed

eval-app/pyproject.toml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ dependencies = [
2323
"plotly[express]==6.4.0",
2424
"anywidget==0.9.18",
2525
"nbformat==5.10.4",
26-
"langchain==0.3.27",
27-
"langchain-core==0.3.76",
28-
"langchain-openai==0.3.33",
26+
"langchain>=1.2.8",
27+
"langchain-core>=1.2.8",
28+
"langchain-openai>=1.1.7",
29+
"langchain-mcp-adapters>=0.2.1",
2930
]

eval-app/src/evaluator/data/toxpipe_eval_info/config/raw/providers.yaml

Lines changed: 64 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,70 @@ base-model:
6262
id: mistral-large-2
6363
label: Mistral Large 2
6464
func: queryLLM
65+
mcp:
66+
providers:
67+
- config:
68+
temperature: 0
69+
id: azure-o3
70+
label: o3
71+
func: queryToxPipeMCP
72+
- config:
73+
temperature: 0
74+
reasoning_effort: high
75+
id: azure-gpt-5
76+
label: GPT-5 (high reasoning)
77+
func: queryToxPipeMCP
78+
- config:
79+
temperature: 0
80+
reasoning_effort: low
81+
id: azure-gpt-5
82+
label: GPT-5 (low reasoning)
83+
func: queryToxPipeMCP
84+
- config:
85+
temperature: 0
86+
id: azure-gpt-5-nano
87+
label: GPT-5 Nano
88+
func: queryToxPipeMCP
89+
- config:
90+
temperature: 0
91+
id: azure-gpt-4o
92+
label: GPT-4o
93+
func: queryToxPipeMCP
94+
- config:
95+
temperature: 0
96+
id: claude-4.5-haiku
97+
label: Claude 4.5 Haiku
98+
func: queryToxPipeMCP
99+
- config:
100+
temperature: 0
101+
id: claude-4.5-sonnet
102+
label: Claude 4.5 Sonnet
103+
func: queryToxPipeMCP
104+
- config:
105+
temperature: 0
106+
id: claude-3-7-sonnet
107+
label: Claude 3.7 Sonnet
108+
func: queryToxPipeMCP
109+
- config:
110+
temperature: 0
111+
id: gemini-2.5-pro
112+
label: Gemini 2.5 Pro
113+
func: queryToxPipeMCP
114+
- config:
115+
temperature: 0
116+
id: gemini-2.5-flash
117+
label: Gemini 2.5 Flash
118+
func: queryToxPipeMCP
119+
- config:
120+
temperature: 0
121+
id: llama4-scout-17b-instruct
122+
label: Llama 4 Scout 17B (Instruct)
123+
func: queryToxPipeMCP
124+
- config:
125+
temperature: 0
126+
id: mistral-large-2
127+
label: Mistral Large 2
128+
func: queryToxPipeMCP
65129
rag:
66130
providers:
67131
- config:
@@ -138,82 +202,6 @@ rag:
138202
id: ToxPipeRAG
139203
label: Toxpipe (RAG) [Mistral Large 2]
140204
func: queryToxPipeRAG
141-
mcp:
142-
providers:
143-
- config:
144-
model: azure-o3
145-
temp: 0
146-
id: ToxPipeMCP
147-
label: Toxpipe (MCP) [o3]
148-
func: queryToxPipeMCP
149-
- config:
150-
model: azure-gpt-5
151-
temp: 0
152-
reasoning_effort: high
153-
id: ToxPipeMCP
154-
label: Toxpipe (MCP) [GPT-5 High Reasoning]
155-
func: queryToxPipeMCP
156-
- config:
157-
model: azure-gpt-5
158-
temp: 0
159-
reasoning_effort: low
160-
id: ToxPipeMCP
161-
label: Toxpipe (MCP) [GPT-5 Low Reasoning]
162-
func: queryToxPipeMCP
163-
- config:
164-
model: azure-gpt-5-nano
165-
temp: 0
166-
id: ToxPipeMCP
167-
label: Toxpipe (MCP) [GPT-5 Nano]
168-
func: queryToxPipeMCP
169-
- config:
170-
model: azure-gpt-4o
171-
temp: 0
172-
id: ToxPipeMCP
173-
label: Toxpipe (MCP) [GPT-4o]
174-
func: queryToxPipeMCP
175-
- config:
176-
model: claude-4.5-haiku
177-
temp: 0
178-
id: ToxPipeMCP
179-
label: Toxpipe (MCP) [Claude 4.5 Haiku]
180-
func: queryToxPipeMCP
181-
- config:
182-
model: claude-4.5-sonnet
183-
temp: 0
184-
id: ToxPipeMCP
185-
label: Toxpipe (MCP) [Claude 4.5 Sonnet]
186-
func: queryToxPipeMCP
187-
- config:
188-
model: claude-3-7-sonnet
189-
temp: 0
190-
id: ToxPipeMCP
191-
label: Toxpipe (MCP) [Claude 3.7 Sonnet]
192-
func: queryToxPipeMCP
193-
- config:
194-
model: gemini-2.5-pro
195-
temp: 0
196-
id: ToxPipeMCP
197-
label: Toxpipe (MCP) [Gemini 2.5 Pro]
198-
func: queryToxPipeMCP
199-
- config:
200-
model: gemini-2.5-flash
201-
temp: 0
202-
id: ToxPipeMCP
203-
label: Toxpipe (MCP) [Gemini 2.5 Flash]
204-
func: queryToxPipeMCP
205-
- config:
206-
model: llama4-scout-17b-instruct
207-
temp: 0
208-
id: ToxPipeMCP
209-
label: Toxpipe (MCP) [Llama 4 Scout 17B (Instruct)]
210-
func: queryToxPipeMCP
211-
- config:
212-
model: mistral-large-2
213-
temp: 0
214-
id: ToxPipeMCP
215-
label: Toxpipe (MCP) [Mistral Large 2]
216-
func: queryToxPipeMCP
217205
agentic:
218206
providers:
219207
- config:

eval-app/src/evaluator/notebooks/create_bulk_tests.ipynb

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": 14,
66
"metadata": {},
77
"outputs": [],
88
"source": [
99
"import pandas as pd\n",
1010
"import yaml\n",
1111
"import importlib\n",
12-
"utils = importlib.import_module(\"eval-app.src.utils\")\n",
12+
"utils = importlib.import_module(\"src.utils\")\n",
1313
"\n",
1414
"class MyDumper(yaml.Dumper):\n",
1515
"\n",
@@ -19,7 +19,7 @@
1919
},
2020
{
2121
"cell_type": "code",
22-
"execution_count": 2,
22+
"execution_count": 15,
2323
"metadata": {},
2424
"outputs": [],
2525
"source": [
@@ -44,7 +44,7 @@
4444
},
4545
{
4646
"cell_type": "code",
47-
"execution_count": 3,
47+
"execution_count": 16,
4848
"metadata": {},
4949
"outputs": [],
5050
"source": [
@@ -81,7 +81,7 @@
8181
},
8282
{
8383
"cell_type": "code",
84-
"execution_count": 5,
84+
"execution_count": 17,
8585
"metadata": {},
8686
"outputs": [],
8787
"source": [
@@ -144,7 +144,7 @@
144144
},
145145
{
146146
"cell_type": "code",
147-
"execution_count": 6,
147+
"execution_count": 18,
148148
"metadata": {},
149149
"outputs": [
150150
{
@@ -433,7 +433,7 @@
433433
"[5 rows x 22 columns]"
434434
]
435435
},
436-
"execution_count": 6,
436+
"execution_count": 18,
437437
"metadata": {},
438438
"output_type": "execute_result"
439439
}
@@ -446,7 +446,7 @@
446446
},
447447
{
448448
"cell_type": "code",
449-
"execution_count": 7,
449+
"execution_count": 19,
450450
"metadata": {},
451451
"outputs": [],
452452
"source": [
@@ -462,7 +462,7 @@
462462
},
463463
{
464464
"cell_type": "code",
465-
"execution_count": 8,
465+
"execution_count": 20,
466466
"metadata": {},
467467
"outputs": [
468468
{
@@ -744,7 +744,7 @@
744744
"[5 rows x 22 columns]"
745745
]
746746
},
747-
"execution_count": 8,
747+
"execution_count": 20,
748748
"metadata": {},
749749
"output_type": "execute_result"
750750
}
@@ -757,7 +757,7 @@
757757
},
758758
{
759759
"cell_type": "code",
760-
"execution_count": 9,
760+
"execution_count": 21,
761761
"metadata": {},
762762
"outputs": [],
763763
"source": [
@@ -773,7 +773,7 @@
773773
},
774774
{
775775
"cell_type": "code",
776-
"execution_count": 10,
776+
"execution_count": 22,
777777
"metadata": {},
778778
"outputs": [
779779
{
@@ -966,7 +966,7 @@
966966
"[474 rows x 6 columns]"
967967
]
968968
},
969-
"execution_count": 10,
969+
"execution_count": 22,
970970
"metadata": {},
971971
"output_type": "execute_result"
972972
}
@@ -979,7 +979,7 @@
979979
},
980980
{
981981
"cell_type": "code",
982-
"execution_count": 11,
982+
"execution_count": 23,
983983
"metadata": {},
984984
"outputs": [
985985
{
@@ -1016,7 +1016,7 @@
10161016
"Index: []"
10171017
]
10181018
},
1019-
"execution_count": 11,
1019+
"execution_count": 23,
10201020
"metadata": {},
10211021
"output_type": "execute_result"
10221022
}
@@ -1071,7 +1071,7 @@
10711071
},
10721072
{
10731073
"cell_type": "code",
1074-
"execution_count": 13,
1074+
"execution_count": 24,
10751075
"metadata": {},
10761076
"outputs": [
10771077
{
@@ -1085,7 +1085,7 @@
10851085
"name": "stderr",
10861086
"output_type": "stream",
10871087
"text": [
1088-
"100%|██████████| 4/4 [00:00<00:00, 4.90it/s]\n"
1088+
"100%|██████████| 4/4 [00:02<00:00, 1.84it/s]\n"
10891089
]
10901090
},
10911091
{
@@ -1099,7 +1099,7 @@
10991099
"name": "stderr",
11001100
"output_type": "stream",
11011101
"text": [
1102-
"100%|██████████| 4/4 [00:00<00:00, 6.31it/s]\n"
1102+
"100%|██████████| 4/4 [00:02<00:00, 1.70it/s]\n"
11031103
]
11041104
},
11051105
{
@@ -1113,7 +1113,7 @@
11131113
"name": "stderr",
11141114
"output_type": "stream",
11151115
"text": [
1116-
"100%|██████████| 4/4 [00:00<00:00, 6.58it/s]\n"
1116+
"100%|██████████| 4/4 [00:02<00:00, 1.98it/s]\n"
11171117
]
11181118
},
11191119
{
@@ -1127,7 +1127,7 @@
11271127
"name": "stderr",
11281128
"output_type": "stream",
11291129
"text": [
1130-
"100%|██████████| 4/4 [00:00<00:00, 6.79it/s]\n"
1130+
"100%|██████████| 4/4 [00:01<00:00, 2.03it/s]\n"
11311131
]
11321132
},
11331133
{
@@ -1141,15 +1141,15 @@
11411141
"name": "stderr",
11421142
"output_type": "stream",
11431143
"text": [
1144-
"100%|██████████| 4/4 [00:00<00:00, 6.14it/s]\n"
1144+
"100%|██████████| 4/4 [00:02<00:00, 1.64it/s]\n"
11451145
]
11461146
}
11471147
],
11481148
"source": [
11491149
"import yaml\n",
11501150
"import tqdm\n",
11511151
"import importlib\n",
1152-
"mdb = importlib.import_module(\"eval-app.src.evaluator.src.evaluation.db\")\n",
1152+
"mdb = importlib.import_module(\"src.evaluator.src.evaluation.db\")\n",
11531153
"\n",
11541154
"def loadYML(file_path):\n",
11551155
" data = None\n",

0 commit comments

Comments
 (0)