
Commit 2fa19f2

Bug fixes and README update

1 parent 66753d7 commit 2fa19f2

8 files changed: +150 -18 lines

Dockerfile_parser_api
+1 -1

```diff
@@ -20,7 +20,7 @@ RUN pip install gradio==4.44.1 requests==2.32.3
 # Install cpu version for lightweight Docker Image
 RUN pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu

-ARG CACHEBUST=3
+ARG CACHEBUST=4
 RUN pip install git+https://github.com/bibekyess/FastRAG.git

 COPY fastrag/api.py /app/api.py
```
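Bumping `CACHEBUST` invalidates Docker's layer cache from that `ARG` onward, so the following `pip install` of the git dependency is re-run instead of being served from cache. The same effect can be had without editing the Dockerfile by passing the value at build time (the image tag here is illustrative):

```bash
# A changed build-arg value busts the cache for every later layer,
# forcing a fresh install of FastRAG from GitHub.
docker build --build-arg CACHEBUST=$(date +%s) \
  -f Dockerfile_parser_api -t fastrag-parser-api .
```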

README.md
+96 -4

```diff
@@ -1,11 +1,103 @@
 # FastRAG
-A simple RAG application that is optimized to run fast on general grade PCs

-# TODO
-- [ ] Make stubbs test

-`pip install llama-index-embeddings-huggingface` may install unnecessary nvidia-cuda libraries be careful to install cpuonly stuffs
```

The new README body:

# FastRAG

FastRAG is a simple Retrieval-Augmented Generation (RAG) application optimized for fast performance on general-grade PCs. It provides a chatbot interface that leverages vector-based search and large language models (LLMs) for answering questions and interacting with document-based data.

---

### 🚀 Launch API and Demo Locally

To get started with FastRAG locally, follow these steps:

1. Clone the repository:
```bash
git clone https://github.com/bibekyess/FastRAG.git
```

2. Navigate to the project directory:
```bash
cd FastRAG
```

3. Build and launch the containers:
```bash
docker compose up --build
```

This will start the FastRAG API and demo with all necessary services.
---

### 🛠️ API Endpoints

The FastRAG application exposes several API endpoints (a `curl` sketch follows the list):

1. **Get Conversation History**
   - **Method**: `GET`
   - **Endpoint**: `/conversation-history`
   - **Parameters**:
     - `collection_name` (str): Name of the collection to fetch history from.
     - `limit` (int): Number of history entries to return. Default is 10.

2. **Add to Conversation History**
   - **Method**: `POST`
   - **Endpoint**: `/conversation-history`
   - **Body**:
     - `collection_name` (str): Name of the collection to store the history in.
     - `query` (str): User input query.
     - `response_text` (str): AI response.

3. **Parse Document**
   - **Method**: `POST`
   - **Endpoint**: `/parse`
   - **Parameters**:
     - `file` (UploadFile): The document to be parsed.
     - `index_id` (str): Index name for the document. Default is `files`.
     - `splitting_type` (Literal['raw', 'md']): Splitting type for the document. Default is `raw` (splitting based on the chunk settings).

4. **Chat with the Bot**
   - **Method**: `POST`
   - **Endpoint**: `/chat`
   - **Body**:
     - `user_input` (str): The user's query.
     - `index_id` (str): The index to search. Default is `"files"`.
     - `llm_text` (str): The LLM model to use. Default is `"local"`.
     - `dense_top_k` (int): The number of top results to return from the vector search. Default is 5.
     - `upgrade_user_input` (bool): Whether to rewrite the user input using the conversation history. Default is `False`.
     - `stream` (bool): Flag to enable streaming of results. Default is `True`.
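A minimal smoke test of the `/chat` endpoint with `curl`, assuming the parser API's port 8090 from the compose file is published on localhost (the question text is only an example):

```bash
# Stream an answer from the default "files" index; --no-buffer prints
# tokens as they arrive instead of waiting for the full response.
curl --no-buffer -X POST http://localhost:8090/chat \
  -H "Content-Type: application/json" \
  -d '{
        "user_input": "What is this document about?",
        "index_id": "files",
        "llm_text": "local",
        "dense_top_k": 5,
        "upgrade_user_input": false,
        "stream": true
      }'
```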
### 🖥️ User Interface

- **Gradio UI**: FastRAG features a simple Gradio-based user interface for interacting with the chatbot.
- **Real-time Chat**: Users can upload a document and ask questions in real time; previous conversations are stored and used for context-based improvements. [Providing the option to upload a document is in progress.]

---

### 🗃️ Storage and Database

- **QdrantDB**: The vector embeddings and chatbot conversation history are stored in QdrantDB, which allows the chatbot to use previous conversation context for improved responses.
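Stored turns can be pulled back through the GET endpoint listed above; the collection name here is hypothetical:

```bash
curl "http://localhost:8090/conversation-history?collection_name=chat_history&limit=5"
```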
---

### ⚡ Model Backend

- **Model**: [bartowski/Llama-3.2-3B-Instruct-GGUF](https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF)

---

### ⏱️ Latency Tracking

- **UI Display**: The latency of the chatbot's response is displayed in the Gradio interface.
- **Logging**: Detailed logs of latency and other events are saved for debugging and performance monitoring.

---
### 🧾 Document Parsing Options

FastRAG offers two options for segmenting documents into chunks (a sliding-window sketch follows this section):

1. **Raw Format**: Allows experimenting with various chunk sizes, strides, and overlap settings for raw text parsing.
2. **Markdown Format**: Segments the document based on semantic information, creating more context-aware chunks.

---
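As an illustration of the raw option, a minimal sketch of sliding-window chunking; this is not the project's actual splitter, and the chunk size and overlap values are arbitrary:

```python
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 64) -> list[str]:
    """Split text into fixed-size windows that overlap by `overlap` characters,
    so content cut at one boundary appears whole in a neighbouring chunk."""
    stride = chunk_size - overlap
    return [text[i:i + chunk_size] for i in range(0, max(len(text) - overlap, 1), stride)]

# Example: ~11 kB of text yields 24 overlapping 512-character chunks.
print(len(chunk_text("lorem ipsum " * 900)))
```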

__init__.py

Whitespace-only changes.

docker-compose.yaml
+2 -1

```diff
@@ -10,7 +10,8 @@ services:
     environment:
       - PARSER_API_URL=http://parser-api:8090/chat
       - CONVERSATION_HISTORY_URL=http://parser-api:8090/conversation-history
-
+    volumes:
+      - ./logs:/app/logs

   parser-api:
     build:
```
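With the new `./logs` bind mount, the chat log that `demo_ui.py` writes inside the container (see the `log_to_file` addition below) lands on the host, so it can be followed while the stack runs:

```bash
tail -f logs/chatbot_log.txt
```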

fastrag/api.py
+7 -7

```diff
@@ -27,10 +27,10 @@
 logger.addHandler(file_handler)

 # Set up stream handler to print logs to the terminal
-stream_handler = logging.StreamHandler()
-stream_handler.setLevel(logging.INFO)
-stream_handler.setFormatter(formatter)
-logger.addHandler(stream_handler)
+# stream_handler = logging.StreamHandler()
+# stream_handler.setLevel(logging.INFO)
+# stream_handler.setFormatter(formatter)
+# logger.addHandler(stream_handler)


 qdrant_url = os.getenv("QDRANT_URL", "http://0.0.0.0:6333")
@@ -154,7 +154,7 @@ async def parse(file: UploadFile = File(...), index_id: str="files", splitting_t
     return base_retriever.add_documents_to_index(documents=documents, index_id=index_id)


-def llamacpp_inference(prompt, n_predict=128, temperature=0.7, top_p=0.95, stop=None, stream=True):
+def llamacpp_inference(prompt, n_predict=512, temperature=0.7, top_p=0.95, stop=None, stream=True):
     url = os.getenv("LLAMACPP_URL", "http://localhost:8088/completion")

     payload = {
@@ -220,11 +220,11 @@ async def chat(request: ChatRequest):
         passed_llm_prompt = LLM_PROMPT.format(context_str=context_str, query_str=user_input)
         logger.info(f"passed llm prompt: {str(passed_llm_prompt)}")
         if stream:
-            streamer = llamacpp_inference(passed_llm_prompt, n_predict=200, temperature=0.3, stream=stream)
+            streamer = llamacpp_inference(passed_llm_prompt, n_predict=512, temperature=0.4, stream=stream)

             return StreamingResponse(streamer, media_type="text/plain")
         else:
-            response = llamacpp_inference(passed_llm_prompt, n_predict=200, temperature=0.3, stream=stream)
+            response = llamacpp_inference(passed_llm_prompt, n_predict=512, temperature=0.4, stream=stream)
             return {'response': response}
```
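The commit raises `n_predict` from 128/200 to 512 and nudges the chat temperature from 0.3 to 0.4. For reference, a minimal sketch of the request `llamacpp_inference` builds, assuming a llama.cpp server on the default URL from the code:

```python
import requests

# Non-streaming completion against the llama.cpp /completion endpoint;
# the field values mirror the new defaults from this commit.
payload = {
    "prompt": "Answer briefly: what does RAG stand for?",
    "n_predict": 512,    # max tokens to generate (was 128/200)
    "temperature": 0.4,
    "top_p": 0.95,
    "stream": False,
}
resp = requests.post("http://localhost:8088/completion", json=payload)
print(resp.json()["content"])
```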

fastrag/demo_ui.py
+20 -0

```diff
@@ -4,11 +4,27 @@
 import logging
 import time
 from time import perf_counter
+from datetime import datetime
+

 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)


+def log_to_file(question, response, latency):
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    log_entry = f"Timestamp: {timestamp}\n"
+    log_entry += f"Question: {question}\n"
+    log_entry += f"Response: {response}\n"
+    log_entry += f"Latency: {latency:.4f} seconds\n"
+    log_entry += "-" * 50 + "\n"
+
+    # Append mode, so earlier entries are kept across sessions
+    with open("./logs/chatbot_log.txt", "a") as file:
+        file.write(log_entry)
+
+
 def call_chat_api(user_input):
     url = os.getenv("PARSER_API_URL", "http://localhost:8080/chat")
     headers = {
@@ -20,6 +36,7 @@ def call_chat_api(user_input):
         "index_id": "files",
         "llm_text": "local",
         "dense_top_k": 4,
+        "upgrade_user_input": True,
         "stream": True
     }

@@ -52,6 +69,9 @@ def chat(chatbot_history):
     elapsed_time = end_time-start_time
     logger.info(f"Chat API executed in {elapsed_time:.4f} seconds")

+    # Log the question, response and latency to a .txt file
+    log_to_file(query, response_text, elapsed_time)
+
     yield chatbot_history, f"## Latency of Last Response: {elapsed_time:.4f} seconds"
```

fastrag/notebooks/qdrant_sandbox.ipynb
+21 -1

```diff
@@ -31,6 +31,21 @@
     "qdrant_database.create_collection(collection_name)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "# qdrant_database.qdrant_client.scroll(\n",
+    "# collection_name=collection_name,\n",
+    "# limit=100000,\n",
+    "# order_by=\"id\"\n",
+    "# )"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -40,14 +55,19 @@
     "qdrant_database.load_recent_responses(collection_name=collection_name, limit=20)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "# qdrant_database.add_response(collection_name, \"what are you doing?\", \"I am studying\")\n",
-    "qdrant_database.add_response(collection_name, \"tell me about Bib\", \"It is a country.\")"
+    "qdrant_database.add_response(collection_name, \"tell me about Bib\", \"It is a boy.\")"
    ]
   },
   {
```

fastrag/utilities/qdrant_database.py
+3 -4

```diff
@@ -48,11 +48,10 @@ def add_response(self, collection_name, query, response_text):
     def load_recent_responses(self, collection_name, limit: int=10):
         search_results = self.qdrant_client.scroll(
             collection_name = collection_name,
+            limit=10e16,  # Retrieve every point
             with_payload=True
-        )[0]
-
-        print(len(search_results))
-
+        )
+        search_results = search_results[0]
         return [response.payload for response in search_results[-limit:]]

     def delete_collection(self, collection_name):
```
