Commit
add demo-document-indexing example to LLM app repo (#5627)
GitOrigin-RevId: 16f2ba3cb780db206b30b47eb705e4e744ac0bcc
zxqfd555-pw authored and Manul from Pathway committed Feb 7, 2024
1 parent b8fb490 commit 1da6e11
Showing 14 changed files with 134 additions and 0 deletions.
8 binary files not shown.
74 changes: 74 additions & 0 deletions examples/pipelines/demo-document-indexing/main.py
@@ -0,0 +1,74 @@
import argparse
import os

from pathway.xpacks.llm.embedders import OpenAIEmbedder
from pathway.xpacks.llm.parsers import ParseUnstructured
from pathway.xpacks.llm.splitters import TokenCountSplitter
from pathway.xpacks.llm.vector_store import VectorStoreServer

from sources.gdrive import get_table as get_gdrive_table
from sources.local import get_table as get_local_table
from sources.sharepoint import get_table as get_sharepoint_table


def data_sources(source_types):
    """Build the list of input tables for the requested source types."""
    parsed_source_types = {x.strip().lower() for x in source_types.split(",")}
    sources = []
    if "local" in parsed_source_types:
        sources.append(get_local_table())
    if "sharepoint" in parsed_source_types:
        sources.append(get_sharepoint_table())
    if "gdrive" in parsed_source_types:
        sources.append(get_gdrive_table())
    return sources


if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(
        prog="Pathway realtime document indexing demo",
        description="""
This demo runs real-time indexing of documents from various data sources.
It starts a simple web server that answers queries on the /v1/retrieve,
/v1/statistics, and /v1/inputs endpoints. Please refer to the "Test the REST
endpoint" section on the Hosted Pipelines website: https://cloud.pathway.com.
Currently, three kinds of data sources are supported: local files, Google Drive,
and Microsoft SharePoint. To run the demo, store your OpenAI API key in the
OPENAI_API_KEY environment variable.
""",
    )
    arg_parser.add_argument(
        "--host",
        help="Host the web server binds to",
        default="0.0.0.0",
    )
    arg_parser.add_argument(
        "--port",
        help="Port the web server listens on",
        type=int,
        default=21401,
    )
    arg_parser.add_argument(
        "--source-types",
        help="Comma-separated source types to be used. "
        "Possible options are local, gdrive, and sharepoint. If the local "
        "source is chosen, documents are read from the top level of "
        "the 'files-for-indexing/' folder",
        default="local",
    )
    args = arg_parser.parse_args()

    # Chunk documents, embed the chunks, and parse raw files with Unstructured.
    splitter = TokenCountSplitter(max_tokens=1000)
    embedder = OpenAIEmbedder(api_key=os.environ["OPENAI_API_KEY"])
    parser = ParseUnstructured()

    vs_server = VectorStoreServer(
        *data_sources(args.source_types),
        embedder=embedder,
        splitter=splitter,
        parser=parser,
    )
    vs_server.run_server(
        host=args.host,
        port=args.port,
        threaded=False,
        with_cache=True,
    )
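
Once the pipeline is running, the endpoints listed in the description can be queried over HTTP. Below is a minimal client sketch, assuming the server runs locally on the default port; the {"query": ..., "k": ...} payload shape is an assumption inferred from the /v1/retrieve endpoint name above, not a canonical client.

import requests

# Retrieve the top-k chunks for a query. The "query"/"k" field names are an
# assumption based on the endpoint named in the description; adjust if the
# API differs.
response = requests.post(
    "http://localhost:21401/v1/retrieve",
    json={"query": "What do the indexed documents say?", "k": 3},
)
print(response.json())

# Index statistics; assumed here to take an empty JSON body.
stats = requests.post("http://localhost:21401/v1/statistics", json={})
print(stats.json())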
Empty file (presumably sources/__init__.py, so that 'sources' imports as a package).
1 change: 1 addition & 0 deletions examples/pipelines/demo-document-indexing/sources/common.py
@@ -0,0 +1 @@
FILE_SIZE_LIMIT = 4 * 1024 * 1024 # don't process files larger than 4 MB
20 changes: 20 additions & 0 deletions examples/pipelines/demo-document-indexing/sources/gdrive.py
@@ -0,0 +1,20 @@
from os.path import dirname, realpath

import pathway.io.gdrive as gdrive_connector

from .common import FILE_SIZE_LIMIT

GDRIVE_CONFIG = {
    # ID of the Google Drive folder to index; the service account below must
    # have read access to it.
    "object_id": "1cULDv2OaViJBmOfG5WB0oWcgayNrGtVs",
    "service_user_credentials_file": f"{realpath(dirname(__file__))}/secrets/gdrive/gdrive_indexer.json",
    "refresh_interval": 5,  # poll the folder for changes every 5 seconds
}


def get_table():
    gdrive_table = gdrive_connector.read(
        **GDRIVE_CONFIG,
        with_metadata=True,
        object_size_limit=FILE_SIZE_LIMIT,
    )
    return gdrive_table
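
A small preflight sketch for this source: it only checks that the service account JSON referenced in GDRIVE_CONFIG is in place before the pipeline starts. The path is taken from the config above; the check itself is illustrative.

from os.path import dirname, isfile, realpath

creds = f"{realpath(dirname(__file__))}/secrets/gdrive/gdrive_indexer.json"
if not isfile(creds):
    # Fail fast with a clear message instead of a connector error at runtime.
    raise FileNotFoundError(f"Place your service account JSON at {creds}")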
12 changes: 12 additions & 0 deletions examples/pipelines/demo-document-indexing/sources/local.py
@@ -0,0 +1,12 @@
import pathway.io.fs as filesystem_connector

FILESYSTEM_CONFIG = {
    "path": "files-for-indexing/",
    "format": "binary",
    "with_metadata": True,
}


def get_table():
    filesystem_table = filesystem_connector.read(**FILESYSTEM_CONFIG)
    return filesystem_table
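
To give the local source something to watch, the folder from FILESYSTEM_CONFIG can be seeded with a sample document. A tiny setup sketch follows; the file name and contents are illustrative.

import pathlib

folder = pathlib.Path("files-for-indexing")
folder.mkdir(exist_ok=True)
# Any file dropped here is picked up and indexed while the pipeline runs.
(folder / "sample.txt").write_text("Pathway indexes this file in real time.")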
27 changes: 27 additions & 0 deletions examples/pipelines/demo-document-indexing/sources/sharepoint.py
@@ -0,0 +1,27 @@
import os
from os.path import dirname, realpath

from pathway.xpacks.connectors import sharepoint as sharepoint_connector

from .common import FILE_SIZE_LIMIT

SHAREPOINT_MUTABLE_COLLECTION_PATH = "Shared Documents/IndexerSandbox"

SHAREPOINT_SITE_CONFIG = {
    "url": "https://navalgo.sharepoint.com/sites/ConnectorSandbox",
    "tenant": os.environ["SHAREPOINT_TENANT"],
    "client_id": os.environ["SHAREPOINT_CLIENT_ID"],
    # Certificate-based app-only authentication: the .pem certificate and its
    # thumbprint identify the app registered for this tenant.
    "cert_path": f"{realpath(dirname(__file__))}/secrets/sharepoint/sharepointcert.pem",
    "thumbprint": os.environ["SHAREPOINT_THUMBPRINT"],
    "refresh_interval": 5,  # poll the collection for changes every 5 seconds
}


def get_table():
    sharepoint_table = sharepoint_connector.read(
        **SHAREPOINT_SITE_CONFIG,
        root_path=SHAREPOINT_MUTABLE_COLLECTION_PATH,
        with_metadata=True,
        object_size_limit=FILE_SIZE_LIMIT,
    )
    return sharepoint_table
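
This source reads three environment variables. A fail-fast sketch that verifies they are set before the sharepoint source is enabled; the variable names are taken from SHAREPOINT_SITE_CONFIG above.

import os

for var in ("SHAREPOINT_TENANT", "SHAREPOINT_CLIENT_ID", "SHAREPOINT_THUMBPRINT"):
    if var not in os.environ:
        raise RuntimeError(f"Set {var} before enabling the sharepoint source")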
