Added MCP Agent to the pipeline

amlantalukder · amlantalukder · commit b1c62f5ddb4f · 2026-02-02T12:46:11.000-05:00
diff --git a/eval-app/src/evaluator/data/toxpipe_eval_info/config/raw/providers.yaml b/eval-app/src/evaluator/data/toxpipe_eval_info/config/raw/providers.yaml
@@ -138,6 +138,82 @@ rag:
       id: ToxPipeRAG
       label: Toxpipe (RAG) [Mistral Large 2]
       func: queryToxPipeRAG
+mcp:
+  providers:
+    - config:
+        model: azure-o3
+        temp: 0
+      id: ToxPipeMCP
+      label: Toxpipe (MCP) [o3]
+      func: queryToxPipeMCP
+    - config:
+        model: azure-gpt-5
+        temp: 0
+        reasoning_effort: high
+      id: ToxPipeMCP
+      label: Toxpipe (MCP) [GPT-5 High Reasoning]
+      func: queryToxPipeMCP
+    - config:
+        model: azure-gpt-5
+        temp: 0
+        reasoning_effort: low
+      id: ToxPipeMCP
+      label: Toxpipe (MCP) [GPT-5 Low Reasoning]
+      func: queryToxPipeMCP
+    - config:
+        model: azure-gpt-5-nano
+        temp: 0
+      id: ToxPipeMCP
+      label: Toxpipe (MCP) [GPT-5 Nano]
+      func: queryToxPipeMCP
+    - config:
+        model: azure-gpt-4o
+        temp: 0
+      id: ToxPipeMCP
+      label: Toxpipe (MCP) [GPT-4o]
+      func: queryToxPipeMCP
+    - config:
+        model: claude-4.5-haiku
+        temp: 0
+      id: ToxPipeMCP
+      label: Toxpipe (MCP) [Claude 4.5 Haiku]
+      func: queryToxPipeMCP
+    - config:
+        model: claude-4.5-sonnet
+        temp: 0
+      id: ToxPipeMCP
+      label: Toxpipe (MCP) [Claude 4.5 Sonnet]
+      func: queryToxPipeMCP
+    - config:
+        model: claude-3-7-sonnet
+        temp: 0
+      id: ToxPipeMCP
+      label: Toxpipe (MCP) [Claude 3.7 Sonnet]
+      func: queryToxPipeMCP
+    - config:
+        model: gemini-2.5-pro
+        temp: 0
+      id: ToxPipeMCP
+      label: Toxpipe (MCP) [Gemini 2.5 Pro]
+      func: queryToxPipeMCP
+    - config:
+        model: gemini-2.5-flash
+        temp: 0
+      id: ToxPipeMCP
+      label: Toxpipe (MCP) [Gemini 2.5 Flash]
+      func: queryToxPipeMCP
+    - config:
+        model: llama4-scout-17b-instruct
+        temp: 0
+      id: ToxPipeMCP
+      label: Toxpipe (MCP) [Llama 4 Scout 17B (Instruct)]
+      func: queryToxPipeMCP
+    - config:
+        model: mistral-large-2
+        temp: 0
+      id: ToxPipeMCP
+      label: Toxpipe (MCP) [Mistral Large 2]
+      func: queryToxPipeMCP
 agentic:
   providers:
     - config:
diff --git a/eval-app/src/evaluator/notebooks/create_bulk_tests.ipynb b/eval-app/src/evaluator/notebooks/create_bulk_tests.ipynb
@@ -9,7 +9,7 @@
     "import pandas as pd\n",
     "import yaml\n",
     "import importlib\n",
-    "utils = importlib.import_module(\"src.utils\")\n",
+    "utils = importlib.import_module(\"eval-app.src.utils\")\n",
     "\n",
     "class MyDumper(yaml.Dumper):\n",
     "\n",
@@ -81,7 +81,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -144,7 +144,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -433,7 +433,7 @@
        "[5 rows x 22 columns]"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -446,7 +446,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -462,7 +462,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -744,7 +744,7 @@
        "[5 rows x 22 columns]"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -757,7 +757,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -773,7 +773,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -966,7 +966,7 @@
        "[474 rows x 6 columns]"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -979,7 +979,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -1016,7 +1016,7 @@
        "Index: []"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1071,7 +1071,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -1085,7 +1085,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 3/3 [00:02<00:00,  1.48it/s]\n"
+      "100%|██████████| 4/4 [00:00<00:00,  4.90it/s]\n"
      ]
     },
     {
@@ -1099,7 +1099,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 3/3 [00:01<00:00,  1.84it/s]\n"
+      "100%|██████████| 4/4 [00:00<00:00,  6.31it/s]\n"
      ]
     },
     {
@@ -1113,7 +1113,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 3/3 [00:01<00:00,  1.79it/s]\n"
+      "100%|██████████| 4/4 [00:00<00:00,  6.58it/s]\n"
      ]
     },
     {
@@ -1127,7 +1127,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 3/3 [00:01<00:00,  1.94it/s]\n"
+      "100%|██████████| 4/4 [00:00<00:00,  6.79it/s]\n"
      ]
     },
     {
@@ -1141,15 +1141,15 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 3/3 [00:01<00:00,  1.56it/s]\n"
+      "100%|██████████| 4/4 [00:00<00:00,  6.14it/s]\n"
      ]
     }
    ],
    "source": [
     "import yaml\n",
     "import tqdm\n",
     "import importlib\n",
-    "mdb = importlib.import_module(\"src.evaluator.src.evaluation.db\")\n",
+    "mdb = importlib.import_module(\"eval-app.src.evaluator.src.evaluation.db\")\n",
     "\n",
     "def loadYML(file_path):\n",
     "    data = None\n",
@@ -1165,7 +1165,7 @@
     "system_prompt = loadYML(config_path / 'raw' / 'system_prompt.yaml')\n",
     "providers = loadYML(config_path / 'raw' / 'providers.yaml')\n",
     "\n",
-    "num_runs = {'base-model': 1, 'rag': 1, 'agentic': 3}\n",
+    "num_runs = {'base-model': 1, 'rag': 1, 'mcp': 1, 'agentic': 3}\n",
     "\n",
     "for prompt_type in ['basic-prompts', 'tox-type-assertion-prompts']:\n",
     "    for species in ['human', 'rat']:\n",
@@ -1175,7 +1175,7 @@
     "        for prompt in loadYML(config_path / f'{prompt_type}_{species}.yaml')['prompts']:\n",
     "            prompts_vars_asserts.append({'prompt': prompt, 'tests': tests})\n",
     "\n",
-    "        for framework_type in tqdm.tqdm(['base-model', 'rag', 'agentic']):\n",
+    "        for framework_type in tqdm.tqdm(['base-model', 'rag', 'mcp', 'agentic']):\n",
     "            d = {'description': f'Tests on {framework_type} framework with {prompt_type} prompts for {species}',\n",
     "                 'num_runs': num_runs[framework_type]}\n",
     "            d |= system_prompt\n",
@@ -1194,7 +1194,7 @@
     "    prompts_vars_asserts = []\n",
     "    for prompt in loadYML(config_path / f'{prompt_type}_mixed.yaml')['prompts']:\n",
     "        prompts_vars_asserts.append({'prompt': prompt, 'tests': tests})\n",
-    "    for framework_type in tqdm.tqdm(['base-model', 'rag', 'agentic']):\n",
+    "    for framework_type in tqdm.tqdm(['base-model', 'rag', 'mcp', 'agentic']):\n",
     "        d = {'description': f'Tests on {framework_type} framework with {prompt_type} prompts',\n",
     "             'num_runs': num_runs[framework_type]}\n",
     "        d |= system_prompt\n",
diff --git a/eval-app/src/evaluator/src/evaluation/executor.py b/eval-app/src/evaluator/src/evaluation/executor.py
@@ -62,6 +62,21 @@ def queryToxPipeRAG(self):
 
         return {'output': res.get('response', str(res)), 
                     'error': res.get('error', '')}
+    
+    def queryToxPipeMCP(self):
+        """Query the ToxPipe MCP endpoint (GET) with the formatted user prompt plus the model config entries.
+
+        Returns {'output': ..., 'error': ...} built from the JSON reply; raises Exception on a non-OK HTTP status."""
+        prompt = self.prompt_info['user'].format(**self.vars_info)
+        model_params = '&'.join([f'{k}={v}' for k, v in self.model_info['config'].items()])
+        url = f'{Config.env_config['TOXPIPE_MCP_API_HOST']}/mcp/'
+        api_key = Config.env_config.get('TOXPIPE_API_API_KEY', '')
+        headers = { "Authorization": f"Bearer {api_key}" } if api_key else {}
+        # Pass the query via params= so requests percent-encodes '&', '#', '+', '=' in the prompt instead of letting them split or truncate the query string.
+        response = requests.get(url=url, params=[('query', prompt), *((k, str(v)) for k, v in self.model_info['config'].items())], headers=headers, verify=self.cert_path)
+        if not response.ok: raise Exception(f'API url: {url}, query: {prompt}, Model params: {model_params}, Response status code: {response.status_code}, Response: {response.text}')
+        res = response.json()
+        return {'output': res.get('response', str(res)), 'error': res.get('error', '')}
 
     def queryToxPipeAgentic(self):
 
@@ -71,6 +86,7 @@ def queryToxPipeAgentic(self):
             
             model_params = '&'.join([f'{k}={v}' for k, v in self.model_info['config'].items()])
             url = f'{Config.env_config['TOXPIPE_API_HOST']}/agent/create/'
+
             response = requests.get(url=f"{url}?{model_params}", verify=self.cert_path, timeout=None)
             if not response.ok: raise Exception(response.text)