
Commit e93ccf8

Merge branch 'vllm' into main
2 parents 08095f8 + bc734f9 · commit e93ccf8

4 files changed: 303 additions & 1 deletion

File tree

easyroutine/inference/__init__.py
easyroutine/inference/litellm_model_interface.py
pyproject.toml
tutorial/5_inference.ipynb

easyroutine/inference/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,2 +1,3 @@
 from easyroutine.inference.base_model_interface import BaseInferenceModelConfig
-from easyroutine.inference.vllm_model_interface import VLLMInferenceModel, VLLMInferenceModelConfig
+from easyroutine.inference.vllm_model_interface import VLLMInferenceModel, VLLMInferenceModelConfig
+from easyroutine.inference.litellm_model_interface import LiteLLMInferenceModel, LiteLLMInferenceModelConfig
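With this re-export in place, both backends can be imported straight from the package root; a quick sketch (names taken from the diff above):

# both interfaces are now importable from easyroutine.inference
from easyroutine.inference import (
    LiteLLMInferenceModel,
    LiteLLMInferenceModelConfig,
    VLLMInferenceModel,
    VLLMInferenceModelConfig,
)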
easyroutine/inference/litellm_model_interface.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
from easyroutine.inference.base_model_interface import BaseInferenceModel, BaseInferenceModelConfig
from typing import List
from dataclasses import dataclass
from litellm import completion, batch_completion


@dataclass
class LiteLLMInferenceModelConfig(BaseInferenceModelConfig):
    """Configuration for the LiteLLM backend: generation defaults plus provider API keys."""
    model_name: str

    n_gpus: int = 0
    dtype: str = 'bfloat16'
    temperature: float = 0
    top_p: float = 0.95
    max_new_tokens: int = 5000

    openai_api_key: str = ''
    anthropic_api_key: str = ''
    xai_api_key: str = ''


class LiteLLMInferenceModel(BaseInferenceModel):

    def __init__(self, config: LiteLLMInferenceModelConfig):
        self.config = config
        self.set_os_env()

    def set_os_env(self):
        # export the configured keys so LiteLLM can pick them up per provider
        import os
        os.environ['OPENAI_API_KEY'] = self.config.openai_api_key
        os.environ['ANTHROPIC_API_KEY'] = self.config.anthropic_api_key
        os.environ['XAI_API_KEY'] = self.config.xai_api_key

    def convert_chat_messages_to_custom_format(self, chat_messages: List[dict[str, str]]) -> List[dict[str, str]]:
        """
        For now, LiteLLM is compatible with the chat template format we use.
        """
        return chat_messages

    def chat(self, chat_messages: List[dict[str, str]], use_tqdm=False, **kwargs) -> list:
        """
        Generate a response based on the provided chat messages.

        Arguments:
            chat_messages (List[dict[str, str]]): List of chat messages to process.
            **kwargs: Additional parameters for the model.

        Returns:
            list: The list of generated choices from the model.
        """
        chat_messages = self.convert_chat_messages_to_custom_format(chat_messages)

        response = completion(
            model=self.config.model_name,
            messages=chat_messages,
            temperature=self.config.temperature,
            top_p=self.config.top_p,
            max_tokens=self.config.max_new_tokens,
        )
        return response['choices']

    def batch_chat(self, chat_messages: List[List[dict[str, str]]], use_tqdm=False, **kwargs) -> List[list]:
        """
        Generate responses for a batch of chat conversations.

        Arguments:
            chat_messages (List[List[dict[str, str]]]): List of chat conversations to process.
            **kwargs: Additional parameters for the model.

        Returns:
            List[list]: List of generated responses from the model.
        """
        chat_messages = [self.convert_chat_messages_to_custom_format(msg) for msg in chat_messages]

        responses = batch_completion(
            model=self.config.model_name,
            messages=chat_messages,
            temperature=self.config.temperature,
            top_p=self.config.top_p,
            max_tokens=self.config.max_new_tokens,
        )
        return responses
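A minimal usage sketch of the class above (the model id and key value are placeholders, not part of this commit); note that `chat` returns the raw LiteLLM `choices` list, so the text lives under the first choice's message:

from easyroutine.inference import LiteLLMInferenceModel, LiteLLMInferenceModelConfig

config = LiteLLMInferenceModelConfig(
    model_name="gpt-4o-mini",   # placeholder: any LiteLLM-routable model id
    openai_api_key="sk-...",    # placeholder: exported to OPENAI_API_KEY by set_os_env()
)
model = LiteLLMInferenceModel(config)

choices = model.chat([{"role": "user", "content": "Hello!"}])
print(choices[0]["message"]["content"])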

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -24,3 +24,4 @@ pyyaml = "^6.0.2"
 sentencepiece = "^0.2.0"
 transformers = "4.51.1"
 pydantic = "^2.10.6"
+litellm = "^1.74.0"
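A quick way to confirm the new dependency resolved in the active environment (assumes the project has been reinstalled after this change):

from importlib.metadata import version
print(version("litellm"))  # expected to satisfy the ^1.74.0 constraint above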

tutorial/5_inference.ipynb

Lines changed: 217 additions & 0 deletions
In [1]:
from easyroutine import path_to_parents
path_to_parents(2)

%load_ext autoreload
%autoreload 2

Out:
Changed working directory to: /home/francesco/HistoryRevisionismLLM
# Inference Module

`easyroutine` provides a simple interface for interacting with various LLMs through different backends. Specifically, it supports:

- **vLLM**: a high-performance inference engine for large language models running on GPUs.
- **LiteLLM**: a lightweight interface to the OpenAI, Anthropic, and xAI APIs.

## LiteLLM Inference Model

First, load the API keys from the `.env` file:
In [2]:
from dotenv import load_dotenv
load_dotenv()
# get the OpenAI API key from the .env file
import os
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

Then, initialize the interface with the desired model and API keys:
In [3]:
from easyroutine.inference import LiteLLMInferenceModel, LiteLLMInferenceModelConfig
config = LiteLLMInferenceModelConfig(
    model_name="gpt-4.1-nano-2025-04-14",
    openai_api_key=OPENAI_API_KEY
)
model = LiteLLMInferenceModel(config)

Out (stderr):
/home/francesco/HistoryRevisionismLLM/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Out (stdout):
INFO 07-14 11:25:17 [__init__.py:244] Automatically detected platform cuda.
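Because `LiteLLMInferenceModel.__init__` calls `set_os_env()` (see `litellm_model_interface.py` above), the key passed in the config is exported as an environment variable for LiteLLM to read; a quick sanity check:

import os
assert os.environ['OPENAI_API_KEY'] == OPENAI_API_KEY  # exported by set_os_env()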
All the models available in the `easyroutine.inference` module have an `.append_with_chat_template` method that appends a message to the chat history with the specified role (either "user" or "assistant"). The `.chat` method then handles the translation of the chat history into the model-specific format and returns the response.

`append_with_chat_template` takes a message and a role as input, and returns a chat message in the format required by the model. It can also take a `chat_history` parameter to append the message to an existing chat history, as sketched after the next cell.

In [4]:
chat_message = model.append_with_chat_template(message="What is the capital of France?", role="user")
print(chat_message)

Out:
[{'role': 'user', 'content': 'What is the capital of France?'}]
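A minimal sketch of the multi-turn pattern, assuming `chat_history` accepts the message list returned by a previous call (the parameter name comes from the description above; its exact signature is not shown in this commit):

history = model.append_with_chat_template(message="What is the capital of France?", role="user")
history = model.append_with_chat_template(message="The capital of France is Paris.", role="assistant", chat_history=history)
history = model.append_with_chat_template(message="And what about Portugal?", role="user", chat_history=history)
response = model.chat(history)  # the whole history is sent to the model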
In [5]:
response = model.chat(chat_message)
print(response)

Out:
[Choices(finish_reason='stop', index=0, message=Message(content='The capital of France is Paris.', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]), provider_specific_fields={})]
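Since `.chat` returns the raw LiteLLM `choices` list, the answer text can be pulled from the first choice (LiteLLM's `Choices` and `Message` objects support both attribute and dict-style access):

print(response[0].message.content)  # 'The capital of France is Paris.'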
## Batched inference
In [6]:
inputs = [
    model.append_with_chat_template(message="What is the capital of Italy?", role="user"),
    model.append_with_chat_template(message="What is the capital of Germany?", role="user"),
    model.append_with_chat_template(message="What is the capital of Spain?", role="user"),
]

In [ ]:
response = model.batch_chat(inputs)
print([response[i]["choices"][0]["message"].content for i in range(len(response))])  # extract the content of the responses

Out:
[ModelResponse(id='chatcmpl-Bt9jD8Iy4A9h6OyHUuqQEN8s7qqqq', created=1752485135, model='gpt-4.1-nano-2025-04-14', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content='The capital of Italy is Rome.', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]), provider_specific_fields={})], usage=Usage(completion_tokens=7, prompt_tokens=14, total_tokens=21, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=None), prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None)), service_tier='default'), ModelResponse(id='chatcmpl-Bt9jDAATOtSORl4CtVSBblbsH3oX4', created=1752485135, model='gpt-4.1-nano-2025-04-14', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content='The capital of Germany is Berlin.', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]), provider_specific_fields={})], usage=Usage(completion_tokens=7, prompt_tokens=14, total_tokens=21, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=None), prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None)), service_tier='default'), ModelResponse(id='chatcmpl-Bt9jDPPEsRluyqeDvuXYKHPJhWVYE', created=1752485135, model='gpt-4.1-nano-2025-04-14', object='chat.completion', system_fingerprint='fp_38343a2f8f', choices=[Choices(finish_reason='stop', index=0, message=Message(content='The capital of Spain is Madrid.', role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]), provider_specific_fields={})], usage=Usage(completion_tokens=7, prompt_tokens=14, total_tokens=21, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=None), prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None)), service_tier='default')]
Notebook metadata: kernel .venv, Python 3.12.3 (nbformat 4.5).
