Commit
add demo-document-indexing example to LLM app repo (#5627)
GitOrigin-RevId: 16f2ba3cb780db206b30b47eb705e4e744ac0bcc
zxqfd555-pw authored and Manul from Pathway committed Feb 7, 2024
1 parent b8fb490 commit 1da6e11
Showing 14 changed files with 134 additions and 0 deletions.
8 binary files not shown.
74 changes: 74 additions & 0 deletions examples/pipelines/demo-document-indexing/main.py
@@ -0,0 +1,74 @@
import argparse
import os

from pathway.xpacks.llm.embedders import OpenAIEmbedder
from pathway.xpacks.llm.parsers import ParseUnstructured
from pathway.xpacks.llm.splitters import TokenCountSplitter
from pathway.xpacks.llm.vector_store import VectorStoreServer

from sources.gdrive import get_table as get_gdrive_table
from sources.local import get_table as get_local_table
from sources.sharepoint import get_table as get_sharepoint_table


def data_sources(source_types):
    """Build the list of input tables for the requested source types."""
    parsed_source_types = {x.strip().lower() for x in source_types.split(",")}
    sources = []
    if "local" in parsed_source_types:
        sources.append(get_local_table())
    if "sharepoint" in parsed_source_types:
        sources.append(get_sharepoint_table())
    if "gdrive" in parsed_source_types:
        sources.append(get_gdrive_table())
    return sources


if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(
        prog="Pathway realtime document indexing demo",
        description="""
This demo runs real-time indexing of documents from various data sources.
It starts a simple web server that answers queries on the /v1/retrieve,
/v1/statistics, and /v1/inputs endpoints. Please refer to the "Test the REST
endpoint" section on the Hosted Pipelines website: https://cloud.pathway.com.
Currently, three kinds of data sources are supported: local files, Google Drive,
and Microsoft SharePoint. To run the demo, store your OpenAI API key in the
OPENAI_API_KEY environment variable.
""",
    )
    arg_parser.add_argument(
        "--host",
        help="Host the web server binds to",
        default="0.0.0.0",
    )
    arg_parser.add_argument(
        "--port",
        help="Port the web server listens on",
        type=int,
        default=21401,
    )
    arg_parser.add_argument(
        "--source-types",
        help="Comma-separated source types to be used. "
        "Possible options are local, gdrive, and sharepoint. If the local "
        "source is chosen, documents are read from the top level of "
        "the 'files-for-indexing/' folder",
        default="local",
    )
    args = arg_parser.parse_args()

    # Chunk documents, embed the chunks, and parse raw files with Unstructured.
    splitter = TokenCountSplitter(max_tokens=1000)
    embedder = OpenAIEmbedder(api_key=os.environ["OPENAI_API_KEY"])
    parser = ParseUnstructured()

    vs_server = VectorStoreServer(
        *data_sources(args.source_types),
        embedder=embedder,
        splitter=splitter,
        parser=parser,
    )
    vs_server.run_server(
        host=args.host,
        port=args.port,
        threaded=False,
        with_cache=True,
    )
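
Once the pipeline is running, the endpoints listed in the description can be queried over HTTP. Below is a minimal client sketch, assuming the server runs locally on the default port; the {"query": ..., "k": ...} payload shape is an assumption inferred from the /v1/retrieve endpoint name above, not a canonical client.

import requests

# Retrieve the top-k chunks for a query. The "query"/"k" field names are an
# assumption based on the endpoint named in the description; adjust if the
# API differs.
response = requests.post(
    "http://localhost:21401/v1/retrieve",
    json={"query": "What do the indexed documents say?", "k": 3},
)
print(response.json())

# Index statistics; assumed here to take an empty JSON body.
stats = requests.post("http://localhost:21401/v1/statistics", json={})
print(stats.json())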
Empty file (presumably sources/__init__.py, so that 'sources' imports as a package).
1 change: 1 addition & 0 deletions examples/pipelines/demo-document-indexing/sources/common.py
@@ -0,0 +1 @@
FILE_SIZE_LIMIT = 4 * 1024 * 1024 # don't process files larger than 4 MB
20 changes: 20 additions & 0 deletions examples/pipelines/demo-document-indexing/sources/gdrive.py
@@ -0,0 +1,20 @@
from os.path import dirname, realpath

import pathway.io.gdrive as gdrive_connector

from .common import FILE_SIZE_LIMIT

GDRIVE_CONFIG = {
    # ID of the Google Drive folder to index; the service account below must
    # have read access to it.
    "object_id": "1cULDv2OaViJBmOfG5WB0oWcgayNrGtVs",
    "service_user_credentials_file": f"{realpath(dirname(__file__))}/secrets/gdrive/gdrive_indexer.json",
    "refresh_interval": 5,  # poll the folder for changes every 5 seconds
}


def get_table():
    gdrive_table = gdrive_connector.read(
        **GDRIVE_CONFIG,
        with_metadata=True,
        object_size_limit=FILE_SIZE_LIMIT,
    )
    return gdrive_table
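
A small preflight sketch for this source: it only checks that the service account JSON referenced in GDRIVE_CONFIG is in place before the pipeline starts. The path is taken from the config above; the check itself is illustrative.

from os.path import dirname, isfile, realpath

creds = f"{realpath(dirname(__file__))}/secrets/gdrive/gdrive_indexer.json"
if not isfile(creds):
    # Fail fast with a clear message instead of a connector error at runtime.
    raise FileNotFoundError(f"Place your service account JSON at {creds}")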
12 changes: 12 additions & 0 deletions examples/pipelines/demo-document-indexing/sources/local.py
@@ -0,0 +1,12 @@
import pathway.io.fs as filesystem_connector

FILESYSTEM_CONFIG = {
    "path": "files-for-indexing/",
    "format": "binary",
    "with_metadata": True,
}


def get_table():
    filesystem_table = filesystem_connector.read(**FILESYSTEM_CONFIG)
    return filesystem_table
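
To give the local source something to watch, the folder from FILESYSTEM_CONFIG can be seeded with a sample document. A tiny setup sketch follows; the file name and contents are illustrative.

import pathlib

folder = pathlib.Path("files-for-indexing")
folder.mkdir(exist_ok=True)
# Any file dropped here is picked up and indexed while the pipeline runs.
(folder / "sample.txt").write_text("Pathway indexes this file in real time.")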
27 changes: 27 additions & 0 deletions examples/pipelines/demo-document-indexing/sources/sharepoint.py
@@ -0,0 +1,27 @@
import os
from os.path import dirname, realpath

from pathway.xpacks.connectors import sharepoint as sharepoint_connector

from .common import FILE_SIZE_LIMIT

SHAREPOINT_MUTABLE_COLLECTION_PATH = "Shared Documents/IndexerSandbox"

SHAREPOINT_SITE_CONFIG = {
    "url": "https://navalgo.sharepoint.com/sites/ConnectorSandbox",
    "tenant": os.environ["SHAREPOINT_TENANT"],
    "client_id": os.environ["SHAREPOINT_CLIENT_ID"],
    # Certificate-based app-only authentication: the .pem certificate and its
    # thumbprint identify the app registered for this tenant.
    "cert_path": f"{realpath(dirname(__file__))}/secrets/sharepoint/sharepointcert.pem",
    "thumbprint": os.environ["SHAREPOINT_THUMBPRINT"],
    "refresh_interval": 5,  # poll the collection for changes every 5 seconds
}


def get_table():
    sharepoint_table = sharepoint_connector.read(
        **SHAREPOINT_SITE_CONFIG,
        root_path=SHAREPOINT_MUTABLE_COLLECTION_PATH,
        with_metadata=True,
        object_size_limit=FILE_SIZE_LIMIT,
    )
    return sharepoint_table
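
This source reads three environment variables. A fail-fast sketch that verifies they are set before the sharepoint source is enabled; the variable names are taken from SHAREPOINT_SITE_CONFIG above.

import os

for var in ("SHAREPOINT_TENANT", "SHAREPOINT_CLIENT_ID", "SHAREPOINT_THUMBPRINT"):
    if var not in os.environ:
        raise RuntimeError(f"Set {var} before enabling the sharepoint source")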
