fix: add optional graphrag toggle in Dockerfile (#377)

taprosoft · web-flow · commit 6da9db489fb5 · 2024-10-10T16:09:57.000+07:00
* fix: toggle graphrag install in Docker build

* fix: update Dockerfile

* fix: remove unused logic in chat_fn

* fix: disable duckduckgo test due to API limit
diff --git a/.github/workflows/build-push-docker.yaml b/.github/workflows/build-push-docker.yaml
@@ -88,16 +88,34 @@ jobs:
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Build docker image
+      - name: Build docker image (amd64)
         uses: docker/build-push-action@v6
         with:
           file: Dockerfile
           context: .
           push: true
-          platforms: linux/amd64,linux/arm64
+          platforms: linux/amd64
           tags: |
             ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           target: ${{ matrix.target }}
           cache-from: type=gha
           cache-to: type=gha,mode=max
+          build-args: |
+            ENABLE_GRAPHRAG=true
+
+      - name: Build docker image (arm64)
+        uses: docker/build-push-action@v6
+        with:
+          file: Dockerfile
+          context: .
+          push: true
+          platforms: linux/arm64
+          tags: |
+            ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          target: ${{ matrix.target }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          build-args: |
+            ENABLE_GRAPHRAG=false
diff --git a/Dockerfile b/Dockerfile
@@ -14,10 +14,14 @@ RUN apt-get update -qqy && \
       curl \
       cargo
 
+# Setup args
+ARG ENABLE_GRAPHRAG=true
+
 # Set environment variables
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 ENV PYTHONIOENCODING=UTF-8
+ENV ENABLE_GRAPHRAG=${ENABLE_GRAPHRAG}
 
 # Create working directory
 WORKDIR /app
@@ -30,15 +34,19 @@ RUN bash scripts/download_pdfjs.sh $PDFJS_PREBUILT_DIR
 
 # Copy contents
 COPY . /app
+COPY .env.example /app/.env
 
 # Install pip packages
 RUN --mount=type=ssh  \
     --mount=type=cache,target=/root/.cache/pip  \
     pip install -e "libs/kotaemon" \
     && pip install -e "libs/ktem" \
-    && pip install graphrag future \
     && pip install "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements"
 
+RUN --mount=type=ssh  \
+    --mount=type=cache,target=/root/.cache/pip  \
+    if [ "$ENABLE_GRAPHRAG" = "true" ]; then pip install graphrag future; fi
+
 # Clean up
 RUN apt-get autoremove \
     && apt-get clean \
@@ -66,10 +74,6 @@ RUN --mount=type=ssh  \
     --mount=type=cache,target=/root/.cache/pip  \
     pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
 
-# Copy contents
-COPY . /app
-COPY .env.example /app/.env
-
 # Install additional pip packages
 RUN --mount=type=ssh  \
     --mount=type=cache,target=/root/.cache/pip  \
diff --git a/libs/kotaemon/tests/test_agent.py b/libs/kotaemon/tests/test_agent.py
@@ -98,15 +98,15 @@ def generate_chat_completion_obj(text):
             "Action: wikipedia\n"
             "Action Input: Cinnamon AI company\n"
         ),
-        (
-            "The information retrieved from Wikipedia is not "
-            "about Cinnamon AI company, but about Blue Prism, "
-            "a British multinational software corporation. "
-            "I need to try another source to gather information "
-            "about Cinnamon AI company.\n"
-            "Action: duckduckgo_search\n"
-            "Action Input: Cinnamon AI company\n"
-        ),
+        # (
+        #     "The information retrieved from Wikipedia is not "
+        #     "about Cinnamon AI company, but about Blue Prism, "
+        #     "a British multinational software corporation. "
+        #     "I need to try another source to gather information "
+        #     "about Cinnamon AI company.\n"
+        #     "Action: duckduckgo_search\n"
+        #     "Action Input: Cinnamon AI company\n"
+        # ),
         FINAL_RESPONSE_TEXT,
     ]
 ]
diff --git a/libs/ktem/ktem/pages/chat/__init__.py b/libs/ktem/ktem/pages/chat/__init__.py
@@ -1,14 +1,10 @@
 import asyncio
-import csv
 import json
 import re
 from copy import deepcopy
-from datetime import datetime
-from pathlib import Path
 from typing import Optional
 
 import gradio as gr
-from filelock import FileLock
 from ktem.app import BasePage
 from ktem.components import reasonings
 from ktem.db.models import Conversation, engine
@@ -269,10 +265,6 @@ def on_register_events(self):
                     self._suggestion_updated,
                     self._app.user_id,
                 ],
-                outputs=[
-                    self.chat_control.conversation,
-                    self.chat_control.conversation,
-                ],
                 show_progress="hidden",
             )
 
@@ -372,10 +364,6 @@ def on_register_events(self):
                     self._suggestion_updated,
                     self._app.user_id,
                 ],
-                outputs=[
-                    self.chat_control.conversation,
-                    self.chat_control.conversation,
-                ],
                 show_progress="hidden",
             )
 
@@ -995,96 +983,3 @@ def suggest_chat_conv(self, settings, chat_history):
                     pass
 
         return suggested_ques, updated
-
-    def backup_original_info(
-        self, chat_history, settings, info_pannel, original_chat_history
-    ):
-        original_chat_history.append(chat_history[-1])
-        return original_chat_history, settings, info_pannel
-
-    def save_log(
-        self,
-        conversation_id,
-        chat_history,
-        settings,
-        info_panel,
-        original_chat_history,
-        original_settings,
-        original_info_panel,
-        log_dir,
-    ):
-        if not Path(log_dir).exists():
-            Path(log_dir).mkdir(parents=True)
-
-        lock = FileLock(Path(log_dir) / ".lock")
-        # get current date
-        today = datetime.now()
-        formatted_date = today.strftime("%d%m%Y_%H")
-
-        with Session(engine) as session:
-            statement = select(Conversation).where(Conversation.id == conversation_id)
-            result = session.exec(statement).one()
-
-            data_source = deepcopy(result.data_source)
-            likes = data_source.get("likes", [])
-            if not likes:
-                return
-
-        feedback = likes[-1][-1]
-        message_index = likes[-1][0]
-
-        current_message = chat_history[message_index[0]]
-        original_message = original_chat_history[message_index[0]]
-        is_original = all(
-            [
-                current_item == original_item
-                for current_item, original_item in zip(
-                    current_message, original_message
-                )
-            ]
-        )
-
-        dataframe = [
-            [
-                conversation_id,
-                message_index,
-                current_message[0],
-                current_message[1],
-                chat_history,
-                settings,
-                info_panel,
-                feedback,
-                is_original,
-                original_message[1],
-                original_chat_history,
-                original_settings,
-                original_info_panel,
-            ]
-        ]
-
-        with lock:
-            log_file = Path(log_dir) / f"{formatted_date}_log.csv"
-            is_log_file_exist = log_file.is_file()
-            with open(log_file, "a") as f:
-                writer = csv.writer(f)
-                # write headers
-                if not is_log_file_exist:
-                    writer.writerow(
-                        [
-                            "Conversation ID",
-                            "Message ID",
-                            "Question",
-                            "Answer",
-                            "Chat History",
-                            "Settings",
-                            "Evidences",
-                            "Feedback",
-                            "Original/ Rewritten",
-                            "Original Answer",
-                            "Original Chat History",
-                            "Original Settings",
-                            "Original Evidences",
-                        ]
-                    )
-
-                writer.writerows(dataframe)
diff --git a/libs/ktem/ktem/pages/chat/control.py b/libs/ktem/ktem/pages/chat/control.py
@@ -326,11 +326,7 @@ def persist_chat_suggestions(
     ):
         """Update the conversation's chat suggestions"""
         if not is_updated:
-            return (
-                gr.update(),
-                conversation_id,
-                gr.update(visible=False),
-            )
+            return
 
         if user_id is None:
             gr.Warning("Please sign in first (Settings → User Settings)")
@@ -353,13 +349,7 @@ def persist_chat_suggestions(
             session.add(result)
             session.commit()
 
-        history = self.load_chat_history(user_id)
         gr.Info("Chat suggestions updated.")
-        return (
-            gr.update(choices=history),
-            conversation_id,
-            gr.update(visible=False),
-        )
 
     def _on_app_created(self):
         """Reload the conversation once the app is created"""