-
Notifications
You must be signed in to change notification settings - Fork 236
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add demo-document-indexing example to LLM app repo (#5627)
GitOrigin-RevId: 16f2ba3cb780db206b30b47eb705e4e744ac0bcc
- Loading branch information
1 parent
b8fb490
commit 1da6e11
Showing
14 changed files
with
134 additions
and
0 deletions.
There are no files selected for viewing
Binary file added
BIN
+682 KB
...indexing/files-for-indexing/Always up-to-date Vector Data Indexing pipeline _ Pathway.pdf
Binary file not shown.
Binary file added
BIN
+1 MB
examples/pipelines/demo-document-indexing/files-for-indexing/Build an LLM App _ Pathway.pdf
Binary file not shown.
Binary file added
BIN
+767 KB
...es/pipelines/demo-document-indexing/files-for-indexing/Launching Pathway + LlamaIndex.pdf
Binary file not shown.
Binary file added
BIN
+670 KB
...ing/files-for-indexing/Realtime Classification with Nearest Neighbors (1_2) _ Pathway.pdf
Binary file not shown.
Binary file added
BIN
+859 KB
...nes/demo-document-indexing/files-for-indexing/Realtime Twitter Analysis App _ Pathway.pdf
Binary file not shown.
Binary file added
BIN
+1000 KB
...mo-document-indexing/files-for-indexing/Use LLMs to Ingest Raw Text into DB _ Pathway.pdf
Binary file not shown.
Binary file added
BIN
+1.48 MB
examples/pipelines/demo-document-indexing/files-for-indexing/arxiv 2307.13116.pdf
Binary file not shown.
Binary file added
BIN
+635 KB
examples/pipelines/demo-document-indexing/files-for-indexing/pw.io.http _ Pathway.pdf
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import argparse | ||
import os | ||
|
||
from pathway.xpacks.llm.embedders import OpenAIEmbedder | ||
from pathway.xpacks.llm.parsers import ParseUnstructured | ||
from pathway.xpacks.llm.splitters import TokenCountSplitter | ||
from pathway.xpacks.llm.vector_store import VectorStoreServer | ||
from sources.gdrive import get_table as get_gdrive_table | ||
from sources.local import get_table as get_local_table | ||
from sources.sharepoint import get_table as get_sharepoint_table | ||
|
||
|
||
def data_sources(source_types): | ||
parsed_source_types = set([x.strip().lower() for x in source_types.split(",")]) | ||
sources = [] | ||
if "local" in parsed_source_types: | ||
sources.append(get_local_table()) | ||
if "sharepoint" in parsed_source_types: | ||
sources.append(get_sharepoint_table()) | ||
if "gdrive" in parsed_source_types: | ||
sources.append(get_gdrive_table()) | ||
return sources | ||
|
||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser( | ||
prog="Pathway realtime document indexing demo", | ||
description=""" | ||
This is the demo of real-time indexing of the documents from various data sources. | ||
It runs a simple web server that is capable of answering queries on the endpoints | ||
/v1/retrieve, /v1/statistics, /v1/inputs. Please refer to the "Test the REST endpoint" | ||
section at Hosted Pipelines website: https://cloud.pathway.com. | ||
Currently, it supports several data sources: the local one, Google Drive, and Microsoft SharePoint. | ||
For the demo, you need to store your Open AI key in the OPENAI_API_KEY environment variable. | ||
""", | ||
) | ||
parser.add_argument( | ||
"--host", | ||
help="Host that will be used for running the web server", | ||
default="0.0.0.0", | ||
) | ||
parser.add_argument( | ||
"--port", | ||
help="Port that will be used by the web server", | ||
type=int, | ||
default=21401, | ||
) | ||
parser.add_argument( | ||
"--source-types", | ||
help="Comma-separated source types to be used. " | ||
"Possible options are local, gdrive, sharepoint. If the local " | ||
"source is chosen, it will read documents from the top level of " | ||
"the 'files-for-indexing/' folder", | ||
default="local", | ||
) | ||
args = parser.parse_args() | ||
|
||
splitter = TokenCountSplitter(max_tokens=1000) | ||
embedder = OpenAIEmbedder(api_key=os.environ["OPENAI_API_KEY"]) | ||
parser = ParseUnstructured() | ||
vs_server = VectorStoreServer( | ||
*data_sources(args.source_types), | ||
embedder=embedder, | ||
splitter=splitter, | ||
parser=parser, | ||
) | ||
vs_server.run_server( | ||
host=args.host, | ||
port=args.port, | ||
threaded=False, | ||
with_cache=True, | ||
) |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
FILE_SIZE_LIMIT = 4 * 1024 * 1024 # don't process files larger than 4 MB |
20 changes: 20 additions & 0 deletions
20
examples/pipelines/demo-document-indexing/sources/gdrive.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from os.path import dirname, realpath | ||
|
||
import pathway.io.gdrive as gdrive_connector | ||
|
||
from .common import FILE_SIZE_LIMIT | ||
|
||
GDRIVE_CONFIG = { | ||
"object_id": "1cULDv2OaViJBmOfG5WB0oWcgayNrGtVs", | ||
"service_user_credentials_file": f"{realpath(dirname(__file__))}/secrets/gdrive/gdrive_indexer.json", | ||
"refresh_interval": 5, | ||
} | ||
|
||
|
||
def get_table(): | ||
gdrive_table = gdrive_connector.read( | ||
**GDRIVE_CONFIG, | ||
with_metadata=True, | ||
object_size_limit=FILE_SIZE_LIMIT, | ||
) | ||
return gdrive_table |
12 changes: 12 additions & 0 deletions
12
examples/pipelines/demo-document-indexing/sources/local.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import pathway.io.fs as filesystem_connector | ||
|
||
FILESYSTEM_CONFIG = { | ||
"path": "files-for-indexing/", | ||
"format": "binary", | ||
"with_metadata": True, | ||
} | ||
|
||
|
||
def get_table(): | ||
filesystem_table = filesystem_connector.read(**FILESYSTEM_CONFIG) | ||
return filesystem_table |
27 changes: 27 additions & 0 deletions
27
examples/pipelines/demo-document-indexing/sources/sharepoint.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import os | ||
from os.path import dirname, realpath | ||
|
||
from pathway.xpacks.connectors import sharepoint as sharepoint_connector | ||
|
||
from .common import FILE_SIZE_LIMIT | ||
|
||
SHAREPOINT_MUTABLE_COLLECTION_PATH = "Shared Documents/IndexerSandbox" | ||
|
||
SHAREPOINT_SITE_CONFIG = { | ||
"url": "https://navalgo.sharepoint.com/sites/ConnectorSandbox", | ||
"tenant": os.environ["SHAREPOINT_TENANT"], | ||
"client_id": os.environ["SHAREPOINT_CLIENT_ID"], | ||
"cert_path": f"{realpath(dirname(__file__))}/secrets/sharepoint/sharepointcert.pem", | ||
"thumbprint": os.environ["SHAREPOINT_THUMBPRINT"], | ||
"refresh_interval": 5, | ||
} | ||
|
||
|
||
def get_table(): | ||
sharepoint_table = sharepoint_connector.read( | ||
**SHAREPOINT_SITE_CONFIG, | ||
root_path=SHAREPOINT_MUTABLE_COLLECTION_PATH, | ||
with_metadata=True, | ||
object_size_limit=FILE_SIZE_LIMIT, | ||
) | ||
return sharepoint_table |