diff --git a/framework/langchain/conversational_memory.ipynb b/framework/langchain/conversational_memory.ipynb new file mode 100644 index 00000000..da0b64bc --- /dev/null +++ b/framework/langchain/conversational_memory.ipynb @@ -0,0 +1,359 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# CrateDB Chat Message History\n", + "\n", + "This notebook demonstrates how to use the `CrateDBChatMessageHistory`\n", + "to manage chat history in CrateDB, for supporting conversational memory." + ], + "metadata": { + "collapsed": false + }, + "id": "f22eab3f84cbeb37" + }, + { + "cell_type": "markdown", + "source": [ + "## Prerequisites\n", + "\n", + "Install required packages." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [], + "source": [ + "#!pip install -r requirements.txt" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Configuration\n", + "\n", + "To use the storage wrapper, you will need to configure two details.\n", + "\n", + "1. Session Id - a unique identifier of the session, like user name, email, chat id etc.\n", + "2. Database connection string: An SQLAlchemy-compatible URI that specifies the database\n", + " connection. It will be passed to SQLAlchemy create_engine function." + ], + "metadata": { + "collapsed": false + }, + "id": "f8f2830ee9ca1e01" + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "from langchain.memory.chat_message_histories import CrateDBChatMessageHistory\n", + "\n", + "CONNECTION_STRING = \"crate://crate@localhost/?schema=notebook\"\n", + "\n", + "chat_message_history = CrateDBChatMessageHistory(\n", + "\tsession_id=\"test_session\",\n", + "\tconnection_string=CONNECTION_STRING\n", + ")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Basic Usage" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [], + "source": [ + "chat_message_history.add_user_message(\"Hello\")\n", + "chat_message_history.add_ai_message(\"Hi\")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-08-28T10:04:38.077748Z", + "start_time": "2023-08-28T10:04:36.105894Z" + } + }, + "id": "4576e914a866fb40" + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "data": { + "text/plain": "[HumanMessage(content='Hello', additional_kwargs={}, example=False),\n AIMessage(content='Hi', additional_kwargs={}, example=False)]" + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chat_message_history.messages" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-08-28T10:04:38.929396Z", + "start_time": "2023-08-28T10:04:38.915727Z" + } + }, + "id": "b476688cbb32ba90" + }, + { + "cell_type": "markdown", + "source": [ + "## Custom Storage Model\n", + "\n", + "The default data model, which stores information about conversation messages only\n", + "has two slots for storing message details, the session id, and the message dictionary.\n", + "\n", + "If you want to store additional information, like message date, author, language etc.,\n", + "please provide an implementation for a custom message converter.\n", + "\n", + "This example demonstrates how to create a custom message converter, by implementing\n", + "the `BaseMessageConverter` interface." 
+ ], + "metadata": { + "collapsed": false + }, + "id": "2e5337719d5614fd" + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "from typing import Any\n", + "\n", + "from langchain.memory.chat_message_histories.cratedb import generate_autoincrement_identifier\n", + "from langchain.memory.chat_message_histories.sql import BaseMessageConverter\n", + "from langchain.schema import AIMessage, BaseMessage, HumanMessage, SystemMessage\n", + "\n", + "import sqlalchemy as sa\n", + "from sqlalchemy.orm import declarative_base\n", + "\n", + "\n", + "Base = declarative_base()\n", + "\n", + "\n", + "class CustomMessage(Base):\n", + "\t__tablename__ = \"custom_message_store\"\n", + "\n", + "\tid = sa.Column(sa.BigInteger, primary_key=True, default=generate_autoincrement_identifier)\n", + "\tsession_id = sa.Column(sa.Text)\n", + "\ttype = sa.Column(sa.Text)\n", + "\tcontent = sa.Column(sa.Text)\n", + "\tcreated_at = sa.Column(sa.DateTime)\n", + "\tauthor_email = sa.Column(sa.Text)\n", + "\n", + "\n", + "class CustomMessageConverter(BaseMessageConverter):\n", + "\tdef __init__(self, author_email: str):\n", + "\t\tself.author_email = author_email\n", + "\t\n", + "\tdef from_sql_model(self, sql_message: Any) -> BaseMessage:\n", + "\t\tif sql_message.type == \"human\":\n", + "\t\t\treturn HumanMessage(\n", + "\t\t\t\tcontent=sql_message.content,\n", + "\t\t\t)\n", + "\t\telif sql_message.type == \"ai\":\n", + "\t\t\treturn AIMessage(\n", + "\t\t\t\tcontent=sql_message.content,\n", + "\t\t\t)\n", + "\t\telif sql_message.type == \"system\":\n", + "\t\t\treturn SystemMessage(\n", + "\t\t\t\tcontent=sql_message.content,\n", + "\t\t\t)\n", + "\t\telse:\n", + "\t\t\traise ValueError(f\"Unknown message type: {sql_message.type}\")\n", + "\t\n", + "\tdef to_sql_model(self, message: BaseMessage, session_id: str) -> Any:\n", + "\t\tnow = datetime.now()\n", + "\t\treturn CustomMessage(\n", + "\t\t\tsession_id=session_id,\n", + "\t\t\ttype=message.type,\n", + "\t\t\tcontent=message.content,\n", + "\t\t\tcreated_at=now,\n", + "\t\t\tauthor_email=self.author_email\n", + "\t\t)\n", + "\t\n", + "\tdef get_sql_model_class(self) -> Any:\n", + "\t\treturn CustomMessage\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + "\n", + "\tBase.metadata.drop_all(bind=sa.create_engine(CONNECTION_STRING))\n", + "\n", + "\tchat_message_history = CrateDBChatMessageHistory(\n", + "\t\tsession_id=\"test_session\",\n", + "\t\tconnection_string=CONNECTION_STRING,\n", + "\t\tcustom_message_converter=CustomMessageConverter(\n", + "\t\t\tauthor_email=\"test@example.com\"\n", + "\t\t)\n", + "\t)\n", + "\n", + "\tchat_message_history.add_user_message(\"Hello\")\n", + "\tchat_message_history.add_ai_message(\"Hi\")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-08-28T10:04:41.510498Z", + "start_time": "2023-08-28T10:04:41.494912Z" + } + }, + "id": "fdfde84c07d071bb" + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [ + { + "data": { + "text/plain": "[HumanMessage(content='Hello', additional_kwargs={}, example=False),\n AIMessage(content='Hi', additional_kwargs={}, example=False)]" + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chat_message_history.messages" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-08-28T10:04:43.497990Z", + "start_time": "2023-08-28T10:04:43.492517Z" + } + }, + "id": "4a6a54d8a9e2856f" + }, + { + 
"cell_type": "markdown", + "source": [ + "## Custom Name for Session Column\n", + "\n", + "The session id, a unique token identifying the session, is an important property of\n", + "this subsystem. If your database table stores it in a different column, you can use\n", + "the `session_id_field_name` keyword argument to adjust the name correspondingly." + ], + "metadata": { + "collapsed": false + }, + "id": "622aded629a1adeb" + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [], + "source": [ + "import json\n", + "import typing as t\n", + "\n", + "from langchain.memory.chat_message_histories.cratedb import generate_autoincrement_identifier, CrateDBMessageConverter\n", + "from langchain.schema import _message_to_dict\n", + "\n", + "\n", + "Base = declarative_base()\n", + "\n", + "class MessageWithDifferentSessionIdColumn(Base):\n", + "\t__tablename__ = \"message_store_different_session_id\"\n", + "\tid = sa.Column(sa.BigInteger, primary_key=True, default=generate_autoincrement_identifier)\n", + "\tcustom_session_id = sa.Column(sa.Text)\n", + "\tmessage = sa.Column(sa.Text)\n", + "\n", + "\n", + "class CustomMessageConverterWithDifferentSessionIdColumn(CrateDBMessageConverter):\n", + " def __init__(self):\n", + " self.model_class = MessageWithDifferentSessionIdColumn\n", + "\n", + " def to_sql_model(self, message: BaseMessage, custom_session_id: str) -> t.Any:\n", + " return self.model_class(\n", + " custom_session_id=custom_session_id, message=json.dumps(_message_to_dict(message))\n", + " )\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + "\tBase.metadata.drop_all(bind=sa.create_engine(CONNECTION_STRING))\n", + "\n", + "\tchat_message_history = CrateDBChatMessageHistory(\n", + "\t\tsession_id=\"test_session\",\n", + "\t\tconnection_string=CONNECTION_STRING,\n", + "\t\tcustom_message_converter=CustomMessageConverterWithDifferentSessionIdColumn(),\n", + "\t\tsession_id_field_name=\"custom_session_id\",\n", + "\t)\n", + "\n", + "\tchat_message_history.add_user_message(\"Hello\")\n", + "\tchat_message_history.add_ai_message(\"Hi\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [ + { + "data": { + "text/plain": "[HumanMessage(content='Hello', additional_kwargs={}, example=False),\n AIMessage(content='Hi', additional_kwargs={}, example=False)]" + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chat_message_history.messages" + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/framework/langchain/conversational_memory.py b/framework/langchain/conversational_memory.py new file mode 100644 index 00000000..74934e40 --- /dev/null +++ b/framework/langchain/conversational_memory.py @@ -0,0 +1,34 @@ +""" +Demonstrate conversational memory with CrateDB. + +Synopsis:: + + # Install prerequisites. + pip install -r requirements.txt + + # Start database. + docker run --rm -it --publish=4200:4200 crate/crate:nightly + + # Run program. 
+ export CRATEDB_CONNECTION_STRING="crate://crate@localhost/?schema=doc" + python conversational_memory.py +""" +import os +from pprint import pprint + +from langchain.memory.chat_message_histories import CrateDBChatMessageHistory + + +def main(): + + chat_message_history = CrateDBChatMessageHistory( + session_id="test_session", + connection_string=os.environ.get("CRATEDB_CONNECTION_STRING") + ) + chat_message_history.add_user_message("Hello") + chat_message_history.add_ai_message("Hi") + pprint(chat_message_history.messages) + + +if __name__ == "__main__": + main() diff --git a/framework/langchain/document_loader.ipynb b/framework/langchain/document_loader.ipynb new file mode 100644 index 00000000..b8756382 --- /dev/null +++ b/framework/langchain/document_loader.ipynb @@ -0,0 +1,237 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SQLAlchemy\n", + "\n", + "This notebook demonstrates how to load documents from a [CrateDB] database,\n", + "using the document loader `CrateDBLoader`, which is based on [SQLAlchemy].\n", + "\n", + "It loads the result of a database query with one document per row.\n", + "\n", + "[CrateDB]: https://github.com/crate/crate\n", + "[SQLAlchemy]: https://www.sqlalchemy.org/" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Prerequisites\n", + "\n", + "Install required packages." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Populate database." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[32mCONNECT OK\r\n", + "\u001B[0m\u001B[32mPSQL OK, 1 row affected (0.001 sec)\r\n", + "\u001B[0m\u001B[32mDELETE OK, 30 rows affected (0.010 sec)\r\n", + "\u001B[0m\u001B[32mINSERT OK, 30 rows affected (0.011 sec)\r\n", + "\u001B[0m\u001B[0m\u001B[32mCONNECT OK\r\n", + "\u001B[0m\u001B[32mREFRESH OK, 1 row affected (0.026 sec)\r\n", + "\u001B[0m\u001B[0m" + ] + } + ], + "source": [ + "!rm -f mlb_teams_2012.sql\n", + "!wget --quiet https://github.com/crate-workbench/langchain/raw/cratedb/docs/docs/integrations/document_loaders/example_data/mlb_teams_2012.sql\n", + "\n", + "!crash --schema=notebook < mlb_teams_2012.sql;\n", + "!crash --schema=notebook --command \"REFRESH TABLE mlb_teams_2012;\"" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Usage" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import CrateDBLoader\n", + "from pprint import pprint\n", + "\n", + "CONNECTION_STRING = \"crate://crate@localhost/?schema=notebook\"\n", + "\n", + "loader = CrateDBLoader(\n", + " 'SELECT * FROM mlb_teams_2012 ORDER BY \"Team\" LIMIT 5;',\n", + " url=CONNECTION_STRING,\n", + ")\n", + "documents = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='Team: Angels\\nPayroll (millions): 154.49\\nWins: 89', metadata={}),\n", + " Document(page_content='Team: Astros\\nPayroll (millions): 60.65\\nWins: 55', metadata={}),\n", + " 
Document(page_content='Team: Athletics\\nPayroll (millions): 55.37\\nWins: 94', metadata={}),\n", + " Document(page_content='Team: Blue Jays\\nPayroll (millions): 75.48\\nWins: 73', metadata={}),\n", + " Document(page_content='Team: Braves\\nPayroll (millions): 83.31\\nWins: 94', metadata={})]\n" + ] + } + ], + "source": [ + "pprint(documents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Specifying Which Columns are Content vs Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "loader = CrateDBLoader(\n", + " 'SELECT * FROM mlb_teams_2012 ORDER BY \"Team\" LIMIT 5;',\n", + " url=CONNECTION_STRING,\n", + " page_content_columns=[\"Team\"],\n", + " metadata_columns=[\"Payroll (millions)\"],\n", + ")\n", + "documents = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='Team: Angels', metadata={'Payroll (millions)': 154.49}),\n", + " Document(page_content='Team: Astros', metadata={'Payroll (millions)': 60.65}),\n", + " Document(page_content='Team: Athletics', metadata={'Payroll (millions)': 55.37}),\n", + " Document(page_content='Team: Blue Jays', metadata={'Payroll (millions)': 75.48}),\n", + " Document(page_content='Team: Braves', metadata={'Payroll (millions)': 83.31})]\n" + ] + } + ], + "source": [ + "pprint(documents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding Source to Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "loader = CrateDBLoader(\n", + " 'SELECT * FROM mlb_teams_2012 ORDER BY \"Team\" LIMIT 5;',\n", + " url=CONNECTION_STRING,\n", + " source_columns=[\"Team\"],\n", + ")\n", + "documents = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='Team: Angels\\nPayroll (millions): 154.49\\nWins: 89', metadata={'source': 'Angels'}),\n", + " Document(page_content='Team: Astros\\nPayroll (millions): 60.65\\nWins: 55', metadata={'source': 'Astros'}),\n", + " Document(page_content='Team: Athletics\\nPayroll (millions): 55.37\\nWins: 94', metadata={'source': 'Athletics'}),\n", + " Document(page_content='Team: Blue Jays\\nPayroll (millions): 75.48\\nWins: 73', metadata={'source': 'Blue Jays'}),\n", + " Document(page_content='Team: Braves\\nPayroll (millions): 83.31\\nWins: 94', metadata={'source': 'Braves'})]\n" + ] + } + ], + "source": [ + "pprint(documents)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/framework/langchain/document_loader.py b/framework/langchain/document_loader.py new file mode 100644 index 00000000..e6710af2 --- /dev/null +++ b/framework/langchain/document_loader.py @@ -0,0 +1,42 @@ +""" +Exercise the LangChain/CrateDB document loader. + +How to use the SQL document loader, based on SQLAlchemy. 
+ +The example uses the canonical `mlb_teams_2012.csv`, +converted to SQL, see `mlb_teams_2012.sql`. + +Synopsis:: + + # Install prerequisites. + pip install -r requirements.txt + + # Start database. + docker run --rm -it --publish=4200:4200 crate/crate:nightly + + # Provide input data: Acquire SQL file and populate database. + wget https://github.com/crate-workbench/langchain/raw/cratedb/docs/docs/integrations/document_loaders/example_data/mlb_teams_2012.sql + crash < mlb_teams_2012.sql + + # Run program. + export CRATEDB_CONNECTION_STRING="crate://crate@localhost/?schema=doc" + python document_loader.py +""" +import os + +from langchain.document_loaders import CrateDBLoader +from pprint import pprint + + +def main(): + loader = CrateDBLoader( + query="SELECT * FROM mlb_teams_2012 LIMIT 3;", + url=os.environ.get("CRATEDB_CONNECTION_STRING"), + include_rownum_into_metadata=True, + ) + docs = loader.load() + pprint(docs) + + +if __name__ == "__main__": + main() diff --git a/framework/langchain/readme.md b/framework/langchain/readme.md new file mode 100644 index 00000000..fb6d52d4 --- /dev/null +++ b/framework/langchain/readme.md @@ -0,0 +1,115 @@ +# LangChain and CrateDB + + +## About LangChain + +[LangChain] is an open source framework for developing applications powered +by language models. It provides a complete set of powerful and flexible +components for building context-aware, reasoning applications. + +Please refer to the [LangChain documentation] for further information. + +Common end-to-end use cases are: + +- Analyzing structured data +- Chatbots and friends +- Document question answering + +LangChain provides standard, extendable interfaces and external integrations +for the following modules, listed from least to most complex: + +- [Model I/O][Model I/O]: Interface with language models +- [Retrieval][Retrieval]: Interface with application-specific data +- [Chains][Chains]: Construct sequences of calls +- [Agents][Agents]: Let chains choose which tools to use given high-level directives +- [Memory][Memory]: Persist application state between runs of a chain +- [Callbacks][Callbacks]: Log and stream intermediate steps of any chain + + +## What's inside + +[![Made with Jupyter](https://img.shields.io/badge/Made%20with-Jupyter-orange?logo=Jupyter)](https://jupyter.org/try) [![Made with Markdown](https://img.shields.io/badge/Made%20with-Markdown-1f425f.svg?logo=Markdown)](https://commonmark.org) + +This folder provides guidelines and runnable code to get started with [LangChain] +and [CrateDB]. + +- [readme.md](readme.md): The file you are currently reading contains a walkthrough + about how to get started with the LangChain framework and CrateDB, and guides you + to corresponding example programs how to use different subsystems. + +- [requirements.txt](requirements.txt): Pulls in a patched version of LangChain, + as well as the CrateDB client driver and the `crash` command-line interface. 
+ +- `vector_search.ipynb` [![Open on GitHub](https://img.shields.io/badge/Open%20on-GitHub-lightgray?logo=GitHub)](vector_search.ipynb) [![Launch Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/crate/cratedb-examples/amo/framework-langchain?labpath=framework%2Flangchain%2Fvector_search.ipynb) [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/crate/cratedb-examples/blob/amo%2Fframework-langchain/framework/langchain/vector_search.ipynb) + + This notebook explores CrateDB's [`FLOAT_VECTOR`] and [`KNN_MATCH`] functionalities for storing and retrieving + embeddings, and for conducting similarity searches. + +- `document_loader.ipynb` [![Open on GitHub](https://img.shields.io/badge/Open%20on-GitHub-lightgray?logo=GitHub)](document_loader.ipynb) [![Launch Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/crate/cratedb-examples/amo/framework-langchain?labpath=framework%2Flangchain%2Fdocument_loader.ipynb) [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/crate/cratedb-examples/blob/amo%2Fframework-langchain/framework/langchain/document_loader.ipynb) + + The notebook about the Document Loader demonstrates how to query a database table in CrateDB and use it as a + source provider for LangChain documents. + +- `conversational_memory.ipynb` [![Open on GitHub](https://img.shields.io/badge/Open%20on-GitHub-lightgray?logo=GitHub)](conversational_memory.ipynb) [![Launch Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/crate/cratedb-examples/amo/framework-langchain?labpath=framework%2Flangchain%2Fconversational_memory.ipynb) [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/crate/cratedb-examples/blob/amo%2Fframework-langchain/framework/langchain/conversational_memory.ipynb) + + LangChain also supports managing conversation history in SQL databases. This notebook exercises + how that works with CrateDB. + +- Accompanying the Jupyter Notebook files, there are also basic variants of + the corresponding examples, [vector_search.py](vector_search.py), + [document_loader.py](document_loader.py), and + [conversational_memory.py](conversational_memory.py). + + +## Install + +In order to properly set up a sandbox environment to explore the example notebooks +and programs, it is advised to create a Python virtualenv, and install the +dependencies into it. In this way, it is easy to wipe your virtualenv and start +from scratch anytime. + +```shell +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +``` + + +## Setup + +The following commands expect that you are working in a terminal with +an activated virtualenv. +```shell +source .venv/bin/activate +``` + +### CrateDB on localhost + +In order to spin up a CrateDB instance without further ado, you can use +Docker or Podman. +```shell +docker run --rm -it \ + --name=cratedb --publish=4200:4200 --publish=5432:5432 \ + --env=CRATE_HEAP_SIZE=4g crate -Cdiscovery.type=single-node +``` + +### CrateDB Cloud + +Sign up or log in to [CrateDB Cloud], and create a free tier cluster. Within just a few minutes, +a cloud-based development environment is up and running. As soon as your project scales, you can +easily move to a different cluster tier or scale horizontally.
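
### Connection string

Most of the example programs and notebooks read the database address from the
`CRATEDB_CONNECTION_STRING` environment variable, or use a CrateDB instance
running on localhost. The snippet below is a sketch; when connecting to
CrateDB Cloud, substitute your own credentials and host name.

```shell
# Connect to CrateDB on localhost.
export CRATEDB_CONNECTION_STRING="crate://crate@localhost/?schema=notebook"

# Connect to CrateDB Cloud.
export CRATEDB_CONNECTION_STRING="crate://username:password@hostname/?ssl=true&schema=notebook"

# The vector search examples also need a valid OpenAI API key.
export OPENAI_API_KEY="sk-YOUR_OPENAI_API_KEY"
```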
+ + + +[Agents]: https://python.langchain.com/docs/modules/agents/ +[Callbacks]: https://python.langchain.com/docs/modules/callbacks/ +[Chains]: https://python.langchain.com/docs/modules/chains/ +[CrateDB]: https://github.com/crate/crate +[CrateDB Cloud]: https://console.cratedb.cloud +[`FLOAT_VECTOR`]: https://crate.io/docs/crate/reference/en/master/general/ddl/data-types.html#float-vector +[`KNN_MATCH`]: https://crate.io/docs/crate/reference/en/master/general/builtins/scalar-functions.html#scalar-knn-match +[LangChain]: https://www.langchain.com/ +[LangChain documentation]: https://python.langchain.com/ +[Memory]: https://python.langchain.com/docs/modules/memory/ +[Model I/O]: https://python.langchain.com/docs/modules/model_io/ +[Retrieval]: https://python.langchain.com/docs/modules/data_connection/ diff --git a/framework/langchain/requirements.txt b/framework/langchain/requirements.txt new file mode 100644 index 00000000..93a566eb --- /dev/null +++ b/framework/langchain/requirements.txt @@ -0,0 +1,5 @@ +crash +crate[sqlalchemy] +git+https://github.com/crate-workbench/langchain@cratedb#egg=langchain[cratedb,openai]&subdirectory=libs/langchain +python-dotenv +unstructured diff --git a/framework/langchain/vector_search.ipynb b/framework/langchain/vector_search.ipynb new file mode 100644 index 00000000..2b2353f6 --- /dev/null +++ b/framework/langchain/vector_search.ipynb @@ -0,0 +1,485 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CrateDB\n", + "\n", + "This notebook shows how to use the CrateDB vector store functionality around\n", + "[`FLOAT_VECTOR`] and [`KNN_MATCH`]. You will learn how to use it for similarity\n", + "search and other purposes.\n", + "\n", + "It supports:\n", + "- Similarity Search with Euclidean Distance\n", + "- Maximal Marginal Relevance Search (MMR)\n", + "\n", + "## What is CrateDB?\n", + "\n", + "[CrateDB] is an open-source, distributed, and scalable SQL analytics database\n", + "for storing and analyzing massive amounts of data in near real-time, even with\n", + "complex queries. It is PostgreSQL-compatible, based on [Lucene], and inherits\n", + "the shared-nothing distribution layer of [Elasticsearch].\n", + "\n", + "This example uses the [Python client driver for CrateDB].\n", + "\n", + "\n", + "[CrateDB]: https://github.com/crate/crate\n", + "[Elasticsearch]: https://github.com/elastic/elasticsearch\n", + "[`FLOAT_VECTOR`]: https://crate.io/docs/crate/reference/en/master/general/ddl/data-types.html#float-vector\n", + "[`KNN_MATCH`]: https://crate.io/docs/crate/reference/en/master/general/builtins/scalar-functions.html#scalar-knn-match\n", + "[Lucene]: https://github.com/apache/lucene\n", + "[Python client driver for CrateDB]: https://crate.io/docs/python/" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Getting Started\n", + "\n", + "Install required Python packages." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [], + "pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "#!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You need to provide an OpenAI API key, optionally using the environment\n", + "variable `OPENAI_API_KEY`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-09T08:02:16.802456Z", + "start_time": "2023-09-09T08:02:07.065604Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import getpass\n", + "from dotenv import load_dotenv, find_dotenv\n", + "\n", + "# Run `export OPENAI_API_KEY=sk-YOUR_OPENAI_API_KEY`.\n", + "# Get OpenAI api key from `.env` file.\n", + "# Otherwise, prompt for it.\n", + "_ = load_dotenv(find_dotenv())\n", + "OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', getpass.getpass(\"OpenAI API key:\"))\n", + "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "markdown", + "source": [ + "You also need to provide a connection string to your CrateDB database cluster,\n", + "optionally using the environment variable `CRATEDB_CONNECTION_STRING`.\n", + "\n", + "This example uses a CrateDB instance on your workstation, which you can start by\n", + "running [CrateDB using Docker]. Alternatively, you can also connect to a cluster\n", + "running on [CrateDB Cloud].\n", + "\n", + "[CrateDB Cloud]: https://console.cratedb.cloud/\n", + "[CrateDB using Docker]: https://crate.io/docs/crate/tutorials/en/latest/basic/index.html#docker" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [], + "source": [ + "import os\n", + "\n", + "CONNECTION_STRING = os.environ.get(\n", + " \"CRATEDB_CONNECTION_STRING\",\n", + " \"crate://crate@localhost/?schema=notebook\",\n", + ")\n", + "\n", + "# For CrateDB Cloud, use:\n", + "# CONNECTION_STRING = os.environ.get(\n", + "# \"CRATEDB_CONNECTION_STRING\",\n", + "# \"crate://username:password@hostname/?ssl=true&schema=notebook\",\n", + "# )" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-09T08:02:28.174088Z", + "start_time": "2023-09-09T08:02:28.162698Z" + } + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "# Alternatively, the connection string can be assembled from individual\n", + "# environment variables.\n", + "import os\n", + "\n", + "CONNECTION_STRING = CrateDBVectorSearch.connection_string_from_db_params(\n", + " driver=os.environ.get(\"CRATEDB_DRIVER\", \"crate\"),\n", + " host=os.environ.get(\"CRATEDB_HOST\", \"localhost\"),\n", + " port=int(os.environ.get(\"CRATEDB_PORT\", \"4200\")),\n", + " database=os.environ.get(\"CRATEDB_DATABASE\", \"langchain\"),\n", + " user=os.environ.get(\"CRATEDB_USER\", \"crate\"),\n", + " password=os.environ.get(\"CRATEDB_PASSWORD\", \"\"),\n", + ")\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "source": [ + "You will start by importing all required modules." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import CrateDBVectorSearch\n", + "from langchain.document_loaders import UnstructuredURLLoader\n", + "from langchain.docstore.document import Document" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Next, read input data, and tokenize it." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "loader = UnstructuredURLLoader(\n", + " urls=[\"https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt\"],\n", + ")\n", + "documents = loader.load()\n", + "\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Similarity Search with Euclidean Distance (Default)\n", + "\n", + "The module will create a table with the name of the collection. Make sure\n", + "the collection name is unique and that you have the permission to create\n", + "a table." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-09T08:04:16.696625Z", + "start_time": "2023-09-09T08:02:31.817790Z" + }, + "pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "COLLECTION_NAME = \"state_of_the_union_test\"\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "\n", + "db = CrateDBVectorSearch.from_documents(\n", + " embedding=embeddings,\n", + " documents=docs,\n", + " collection_name=COLLECTION_NAME,\n", + " connection_string=CONNECTION_STRING,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-09T08:05:11.104135Z", + "start_time": "2023-09-09T08:05:10.548998Z" + } + }, + "outputs": [], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs_with_score = db.similarity_search_with_score(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-09T08:05:13.532334Z", + "start_time": "2023-09-09T08:05:13.523191Z" + } + }, + "outputs": [], + "source": [ + "for doc, score in docs_with_score:\n", + " print(\"-\" * 80)\n", + " print(\"Score: \", score)\n", + " print(doc.page_content)\n", + " print(\"-\" * 80)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Maximal Marginal Relevance Search (MMR)\n", + "Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "docs_with_score = db.max_marginal_relevance_search_with_score(query)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-09-09T08:05:23.276819Z", + "start_time": "2023-09-09T08:05:21.972256Z" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "for doc, score in docs_with_score:\n", + " print(\"-\" * 80)\n", + " print(\"Score: \", score)\n", + " print(doc.page_content)\n", + " print(\"-\" * 80)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-09-09T08:05:27.478580Z", + "start_time": "2023-09-09T08:05:27.470138Z" + } + } + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Working with the vector store\n", + "\n", + "In the example above, you created a vector store from scratch. When\n", + "aiming to work with an existing vector store, you can initialize it directly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "store = CrateDBVectorSearch(\n", + " collection_name=COLLECTION_NAME,\n", + " connection_string=CONNECTION_STRING,\n", + " embedding_function=embeddings,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add documents\n", + "\n", + "You can also add documents to an existing vector store." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "store.add_documents([Document(page_content=\"foo\")])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs_with_score = db.similarity_search_with_score(\"foo\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs_with_score[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs_with_score[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Overwriting a vector store\n", + "\n", + "If you have an existing collection, you can overwrite it by using `from_documents`,\n", + "aad setting `pre_delete_collection = True`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "db = CrateDBVectorSearch.from_documents(\n", + " documents=docs,\n", + " embedding=embeddings,\n", + " collection_name=COLLECTION_NAME,\n", + " connection_string=CONNECTION_STRING,\n", + " pre_delete_collection=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs_with_score = db.similarity_search_with_score(\"foo\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs_with_score[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using a vector store as a retriever" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "retriever = store.as_retriever()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(retriever)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/framework/langchain/vector_search.py b/framework/langchain/vector_search.py new file mode 100644 index 00000000..8b71d11d --- /dev/null +++ b/framework/langchain/vector_search.py @@ -0,0 +1,48 @@ +""" +Use CrateDB Vector Search with OpenAI embeddings. + +The example uses the canonical `state_of_the_union.txt`. + +Synopsis:: + + # Install prerequisites. + pip install -r requirements.txt + + # Start database. + docker run --rm -it --publish=4200:4200 crate/crate:nightly + + # Provide input data: Acquire text file. + # The example uses the canonical `state_of_the_union.txt`. 
+ wget https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt + + # Configure: Set environment variables. + # Correct OpenAI API key should be used. SQL connection string fits a local instance of CrateDB. + export OPENAI_API_KEY="" + export CRATEDB_CONNECTION_STRING="crate://crate@localhost/?schema=doc" + + # Run program. + python vector_search.py +""" # noqa: E501 +from langchain.document_loaders import TextLoader +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.text_splitter import CharacterTextSplitter +from langchain.vectorstores import CrateDBVectorSearch + + +def main(): + + # Load the document, split it into chunks, embed each chunk, + # and load it into the vector store. + raw_documents = TextLoader("state_of_the_union.txt").load() + text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) + documents = text_splitter.split_documents(raw_documents) + db = CrateDBVectorSearch.from_documents(documents, OpenAIEmbeddings()) + + # Invoke a query, and display the first result. + query = "What did the president say about Ketanji Brown Jackson" + docs = db.similarity_search(query) + print(docs[0].page_content) + + +if __name__ == "__main__": + main()
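

# The accompanying notebook, `vector_search.ipynb`, also demonstrates queries
# which return similarity scores, as well as maximal marginal relevance (MMR)
# search. The `inspect_store` helper below is a hypothetical add-on sketching
# both query styles against the store returned by
# `CrateDBVectorSearch.from_documents()`; it is not invoked by `main()` above.
def inspect_store(db, query: str):
    # Similarity search, including similarity scores.
    for doc, score in db.similarity_search_with_score(query):
        print("-" * 80)
        print("Score: ", score)
        print(doc.page_content)

    # Maximal marginal relevance search, optimizing for similarity to the
    # query and for diversity among the selected documents.
    for doc, score in db.max_marginal_relevance_search_with_score(query):
        print("-" * 80)
        print("Score: ", score)
        print(doc.page_content)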