Skip to content

Commit b1c62f5

Browse files
committed
Added MCP Agent to the pipeline
1 parent 555432b commit b1c62f5

File tree

3 files changed

+114
-22
lines changed

3 files changed

+114
-22
lines changed

eval-app/src/evaluator/data/toxpipe_eval_info/config/raw/providers.yaml

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,82 @@ rag:
138138
id: ToxPipeRAG
139139
label: Toxpipe (RAG) [Mistral Large 2]
140140
func: queryToxPipeRAG
141+
mcp:
142+
providers:
143+
- config:
144+
model: azure-o3
145+
temp: 0
146+
id: ToxPipeMCP
147+
label: Toxpipe (MCP) [o3]
148+
func: queryToxPipeMCP
149+
- config:
150+
model: azure-gpt-5
151+
temp: 0
152+
reasoning_effort: high
153+
id: ToxPipeMCP
154+
label: Toxpipe (MCP) [GPT-5 High Reasoning]
155+
func: queryToxPipeMCP
156+
- config:
157+
model: azure-gpt-5
158+
temp: 0
159+
reasoning_effort: low
160+
id: ToxPipeMCP
161+
label: Toxpipe (MCP) [GPT-5 Low Reasoning]
162+
func: queryToxPipeMCP
163+
- config:
164+
model: azure-gpt-5-nano
165+
temp: 0
166+
id: ToxPipeMCP
167+
label: Toxpipe (MCP) [GPT-5 Nano]
168+
func: queryToxPipeMCP
169+
- config:
170+
model: azure-gpt-4o
171+
temp: 0
172+
id: ToxPipeMCP
173+
label: Toxpipe (MCP) [GPT-4o]
174+
func: queryToxPipeMCP
175+
- config:
176+
model: claude-4.5-haiku
177+
temp: 0
178+
id: ToxPipeMCP
179+
label: Toxpipe (MCP) [Claude 4.5 Haiku]
180+
func: queryToxPipeMCP
181+
- config:
182+
model: claude-4.5-sonnet
183+
temp: 0
184+
id: ToxPipeMCP
185+
label: Toxpipe (MCP) [Claude 4.5 Sonnet]
186+
func: queryToxPipeMCP
187+
- config:
188+
model: claude-3-7-sonnet
189+
temp: 0
190+
id: ToxPipeMCP
191+
label: Toxpipe (MCP) [Claude 3.7 Sonnet]
192+
func: queryToxPipeMCP
193+
- config:
194+
model: gemini-2.5-pro
195+
temp: 0
196+
id: ToxPipeMCP
197+
label: Toxpipe (MCP) [Gemini 2.5 Pro]
198+
func: queryToxPipeMCP
199+
- config:
200+
model: gemini-2.5-flash
201+
temp: 0
202+
id: ToxPipeMCP
203+
label: Toxpipe (MCP) [Gemini 2.5 Flash]
204+
func: queryToxPipeMCP
205+
- config:
206+
model: llama4-scout-17b-instruct
207+
temp: 0
208+
id: ToxPipeMCP
209+
label: Toxpipe (MCP) [Llama 4 Scout 17B (Instruct)]
210+
func: queryToxPipeMCP
211+
- config:
212+
model: mistral-large-2
213+
temp: 0
214+
id: ToxPipeMCP
215+
label: Toxpipe (MCP) [Mistral Large 2]
216+
func: queryToxPipeMCP
141217
agentic:
142218
providers:
143219
- config:

eval-app/src/evaluator/notebooks/create_bulk_tests.ipynb

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"import pandas as pd\n",
1010
"import yaml\n",
1111
"import importlib\n",
12-
"utils = importlib.import_module(\"src.utils\")\n",
12+
"utils = importlib.import_module(\"eval-app.src.utils\")\n",
1313
"\n",
1414
"class MyDumper(yaml.Dumper):\n",
1515
"\n",
@@ -81,7 +81,7 @@
8181
},
8282
{
8383
"cell_type": "code",
84-
"execution_count": 4,
84+
"execution_count": 5,
8585
"metadata": {},
8686
"outputs": [],
8787
"source": [
@@ -144,7 +144,7 @@
144144
},
145145
{
146146
"cell_type": "code",
147-
"execution_count": 5,
147+
"execution_count": 6,
148148
"metadata": {},
149149
"outputs": [
150150
{
@@ -433,7 +433,7 @@
433433
"[5 rows x 22 columns]"
434434
]
435435
},
436-
"execution_count": 5,
436+
"execution_count": 6,
437437
"metadata": {},
438438
"output_type": "execute_result"
439439
}
@@ -446,7 +446,7 @@
446446
},
447447
{
448448
"cell_type": "code",
449-
"execution_count": 6,
449+
"execution_count": 7,
450450
"metadata": {},
451451
"outputs": [],
452452
"source": [
@@ -462,7 +462,7 @@
462462
},
463463
{
464464
"cell_type": "code",
465-
"execution_count": 7,
465+
"execution_count": 8,
466466
"metadata": {},
467467
"outputs": [
468468
{
@@ -744,7 +744,7 @@
744744
"[5 rows x 22 columns]"
745745
]
746746
},
747-
"execution_count": 7,
747+
"execution_count": 8,
748748
"metadata": {},
749749
"output_type": "execute_result"
750750
}
@@ -757,7 +757,7 @@
757757
},
758758
{
759759
"cell_type": "code",
760-
"execution_count": 8,
760+
"execution_count": 9,
761761
"metadata": {},
762762
"outputs": [],
763763
"source": [
@@ -773,7 +773,7 @@
773773
},
774774
{
775775
"cell_type": "code",
776-
"execution_count": 9,
776+
"execution_count": 10,
777777
"metadata": {},
778778
"outputs": [
779779
{
@@ -966,7 +966,7 @@
966966
"[474 rows x 6 columns]"
967967
]
968968
},
969-
"execution_count": 9,
969+
"execution_count": 10,
970970
"metadata": {},
971971
"output_type": "execute_result"
972972
}
@@ -979,7 +979,7 @@
979979
},
980980
{
981981
"cell_type": "code",
982-
"execution_count": 10,
982+
"execution_count": 11,
983983
"metadata": {},
984984
"outputs": [
985985
{
@@ -1016,7 +1016,7 @@
10161016
"Index: []"
10171017
]
10181018
},
1019-
"execution_count": 10,
1019+
"execution_count": 11,
10201020
"metadata": {},
10211021
"output_type": "execute_result"
10221022
}
@@ -1071,7 +1071,7 @@
10711071
},
10721072
{
10731073
"cell_type": "code",
1074-
"execution_count": 11,
1074+
"execution_count": 13,
10751075
"metadata": {},
10761076
"outputs": [
10771077
{
@@ -1085,7 +1085,7 @@
10851085
"name": "stderr",
10861086
"output_type": "stream",
10871087
"text": [
1088-
"100%|██████████| 3/3 [00:02<00:00, 1.48it/s]\n"
1088+
"100%|██████████| 4/4 [00:00<00:00, 4.90it/s]\n"
10891089
]
10901090
},
10911091
{
@@ -1099,7 +1099,7 @@
10991099
"name": "stderr",
11001100
"output_type": "stream",
11011101
"text": [
1102-
"100%|██████████| 3/3 [00:01<00:00, 1.84it/s]\n"
1102+
"100%|██████████| 4/4 [00:00<00:00, 6.31it/s]\n"
11031103
]
11041104
},
11051105
{
@@ -1113,7 +1113,7 @@
11131113
"name": "stderr",
11141114
"output_type": "stream",
11151115
"text": [
1116-
"100%|██████████| 3/3 [00:01<00:00, 1.79it/s]\n"
1116+
"100%|██████████| 4/4 [00:00<00:00, 6.58it/s]\n"
11171117
]
11181118
},
11191119
{
@@ -1127,7 +1127,7 @@
11271127
"name": "stderr",
11281128
"output_type": "stream",
11291129
"text": [
1130-
"100%|██████████| 3/3 [00:01<00:00, 1.94it/s]\n"
1130+
"100%|██████████| 4/4 [00:00<00:00, 6.79it/s]\n"
11311131
]
11321132
},
11331133
{
@@ -1141,15 +1141,15 @@
11411141
"name": "stderr",
11421142
"output_type": "stream",
11431143
"text": [
1144-
"100%|██████████| 3/3 [00:01<00:00, 1.56it/s]\n"
1144+
"100%|██████████| 4/4 [00:00<00:00, 6.14it/s]\n"
11451145
]
11461146
}
11471147
],
11481148
"source": [
11491149
"import yaml\n",
11501150
"import tqdm\n",
11511151
"import importlib\n",
1152-
"mdb = importlib.import_module(\"src.evaluator.src.evaluation.db\")\n",
1152+
"mdb = importlib.import_module(\"eval-app.src.evaluator.src.evaluation.db\")\n",
11531153
"\n",
11541154
"def loadYML(file_path):\n",
11551155
" data = None\n",
@@ -1165,7 +1165,7 @@
11651165
"system_prompt = loadYML(config_path / 'raw' / 'system_prompt.yaml')\n",
11661166
"providers = loadYML(config_path / 'raw' / 'providers.yaml')\n",
11671167
"\n",
1168-
"num_runs = {'base-model': 1, 'rag': 1, 'agentic': 3}\n",
1168+
"num_runs = {'base-model': 1, 'rag': 1, 'mcp': 1, 'agentic': 3}\n",
11691169
"\n",
11701170
"for prompt_type in ['basic-prompts', 'tox-type-assertion-prompts']:\n",
11711171
" for species in ['human', 'rat']:\n",
@@ -1175,7 +1175,7 @@
11751175
" for prompt in loadYML(config_path / f'{prompt_type}_{species}.yaml')['prompts']:\n",
11761176
" prompts_vars_asserts.append({'prompt': prompt, 'tests': tests})\n",
11771177
"\n",
1178-
" for framework_type in tqdm.tqdm(['base-model', 'rag', 'agentic']):\n",
1178+
" for framework_type in tqdm.tqdm(['base-model', 'rag', 'mcp', 'agentic']):\n",
11791179
" d = {'description': f'Tests on {framework_type} framework with {prompt_type} prompts for {species}',\n",
11801180
" 'num_runs': num_runs[framework_type]}\n",
11811181
" d |= system_prompt\n",
@@ -1194,7 +1194,7 @@
11941194
" prompts_vars_asserts = []\n",
11951195
" for prompt in loadYML(config_path / f'{prompt_type}_mixed.yaml')['prompts']:\n",
11961196
" prompts_vars_asserts.append({'prompt': prompt, 'tests': tests})\n",
1197-
" for framework_type in tqdm.tqdm(['base-model', 'rag', 'agentic']):\n",
1197+
" for framework_type in tqdm.tqdm(['base-model', 'rag', 'mcp', 'agentic']):\n",
11981198
" d = {'description': f'Tests on {framework_type} framework with {prompt_type} prompts',\n",
11991199
" 'num_runs': num_runs[framework_type]}\n",
12001200
" d |= system_prompt\n",

eval-app/src/evaluator/src/evaluation/executor.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,21 @@ def queryToxPipeRAG(self):
6262

6363
return {'output': res.get('response', str(res)),
6464
'error': res.get('error', '')}
65+
66+
def queryToxPipeMCP(self):
    """Send the formatted user prompt to the ToxPipe MCP endpoint.

    The user prompt template in ``self.prompt_info['user']`` is filled in
    with ``self.vars_info`` and sent, together with every entry of
    ``self.model_info['config']``, as query parameters of a GET request to
    ``<TOXPIPE_MCP_API_HOST>/mcp/``.

    Returns:
        dict: ``{'output': ..., 'error': ...}`` where ``output`` is the
        ``response`` field of the JSON reply (or the stringified reply when
        that field is missing) and ``error`` is its ``error`` field (or an
        empty string).

    Raises:
        Exception: if the HTTP response status is not OK; the message
            carries the URL, prompt, model parameters, status code and
            response text.
    """
    prompt = self.prompt_info['user'].format(**self.vars_info)

    model_config = self.model_info['config']
    # Human-readable form of the model parameters; only used in the error message.
    model_params = '&'.join([f'{k}={v}' for k, v in model_config.items()])
    url = f"{Config.env_config['TOXPIPE_MCP_API_HOST']}/mcp/"

    # NOTE(review): the key name 'TOXPIPE_API_API_KEY' looks like it may be a
    # typo (doubled "API") -- confirm against the environment configuration.
    api_key = Config.env_config.get('TOXPIPE_API_API_KEY', '')
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

    # Hand the parameters to requests instead of splicing them into the URL
    # by hand: prompts routinely contain '&', '#', '+', '=' or '%', which
    # would otherwise truncate or corrupt the query string. Values are
    # stringified the same way the original f-string formatting did.
    query_params = [('query', prompt)]
    query_params.extend((k, f'{v}') for k, v in model_config.items())

    response = requests.get(url=url, params=query_params, headers=headers, verify=self.cert_path)
    if not response.ok:
        raise Exception(f'API url: {url}, query: {prompt}, Model params: {model_params}, Response status code: {response.status_code}, Response: {response.text}')
    res = response.json()

    return {'output': res.get('response', str(res)),
            'error': res.get('error', '')}
6580

6681
def queryToxPipeAgentic(self):
6782

@@ -71,6 +86,7 @@ def queryToxPipeAgentic(self):
7186

7287
model_params = '&'.join([f'{k}={v}' for k, v in self.model_info['config'].items()])
7388
url = f'{Config.env_config['TOXPIPE_API_HOST']}/agent/create/'
89+
7490
response = requests.get(url=f"{url}?{model_params}", verify=self.cert_path, timeout=None)
7591
if not response.ok: raise Exception(response.text)
7692

0 commit comments

Comments
 (0)