From c167ce10da489b71514be11907aa3f2e1d943d41 Mon Sep 17 00:00:00 2001
From: Hyesoo Kim <100982596+duper203@users.noreply.github.com>
Date: Thu, 3 Oct 2024 09:24:57 -0700
Subject: [PATCH 1/7] Created using Colab
---
05_1_ChromaDB.ipynb | 436 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 436 insertions(+)
create mode 100644 05_1_ChromaDB.ipynb
diff --git a/05_1_ChromaDB.ipynb b/05_1_ChromaDB.ipynb
new file mode 100644
index 0000000..b73d11c
--- /dev/null
+++ b/05_1_ChromaDB.ipynb
@@ -0,0 +1,436 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "bHQUCbh2aim1"
+ },
+ "source": [
+ "\n",
+ "
\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "yRnbgQpzaim2"
+ },
+ "source": [
+ "# 05-1.ChromaDB\n",
+ "\n",
+ "## Overview \n",
+ "In this exercise, we will explore how to use ChromaDB to embed documents and construct a vector space. We will also learn how to create a Retriever object that runs efficient query searches over those documents. This tutorial will guide you through the process of embedding documents and using the resulting vector space for effective information retrieval.\n",
+ "\n",
+ "## Purpose of the Exercise\n",
+ "The purpose of this exercise is to demonstrate how to use the Solar Embedding API to generate embeddings and create a vector space. By the end of this tutorial, users will be able to create a Retriever object and conduct efficient searches within that vector space, making it easier to retrieve relevant information from embedded documents.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "lDjVZoaxaim3"
+ },
+ "source": [
+ "## Keyword vs. Semantic Search\n",
+ "\n",
+ "\n",
+ "from https://blog.dataiku.com/semantic-search-an-overlooked-nlp-superpower"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "FAFLy4Etaim3"
+ },
+ "source": [
+ "\n",
+ "\n",
+ "from https://sreent.medium.com/llms-embeddings-and-vector-search-d4bd9362df56"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "uPHGD_OBaim4"
+ },
+ "outputs": [],
+ "source": [
+ "! pip3 install -qU langchain-chroma markdownify langchain-upstage rank_bm25 python-dotenv langchain"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "id": "KyP3R-L1aim4"
+ },
+ "outputs": [],
+ "source": [
+ "# @title set API key\n",
+ "import os\n",
+ "import getpass\n",
+ "from pprint import pprint\n",
+ "import warnings\n",
+ "\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "\n",
+ "from IPython import get_ipython\n",
+ "\n",
+ "if \"google.colab\" in str(get_ipython()):\n",
+ " # Running in Google Colab. Please set the UPSTAGE_API_KEY in the Colab Secrets\n",
+ " from google.colab import userdata\n",
+ " os.environ[\"UPSTAGE_API_KEY\"] = userdata.get(\"UPSTAGE_API_KEY\")\n",
+ "else:\n",
+ " # Running locally. Please set the UPSTAGE_API_KEY in the .env file\n",
+ " from dotenv import load_dotenv\n",
+ "\n",
+ " load_dotenv()\n",
+ "\n",
+ "if \"UPSTAGE_API_KEY\" not in os.environ:\n",
+ " os.environ[\"UPSTAGE_API_KEY\"] = getpass.getpass(\"Enter your Upstage API key: \")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "id": "0DW5Q0Wkaim4",
+ "outputId": "3ca329f1-407d-405a-fb1a-74f387754e3d",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "[Document(metadata={}, page_content='Korea is a beautiful country to visit in the spring.'), Document(metadata={}, page_content='The best time to visit Korea is in the fall.'), Document(metadata={}, page_content='Best way to find bug is using unit test.'), Document(metadata={}, page_content='Python is a great programming language for beginners.'), Document(metadata={}, page_content='Sung Kim is a great teacher.')]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain_chroma import Chroma\n",
+ "from langchain_upstage import UpstageEmbeddings\n",
+ "from langchain.docstore.document import Document\n",
+ "\n",
+ "from langchain_text_splitters import (\n",
+ " Language,\n",
+ " RecursiveCharacterTextSplitter,\n",
+ ")\n",
+ "\n",
+ "sample_text = [\n",
+ " \"Korea is a beautiful country to visit in the spring.\",\n",
+ " \"The best time to visit Korea is in the fall.\",\n",
+ " \"Best way to find bug is using unit test.\",\n",
+ " \"Python is a great programming language for beginners.\",\n",
+ " \"Sung Kim is a great teacher.\",\n",
+ "]\n",
+ "\n",
+ "splits = RecursiveCharacterTextSplitter().create_documents(sample_text)\n",
+ "\n",
+ "print(splits)\n",
+ "\n",
+ "vectorstore = Chroma.from_documents(\n",
+ " documents=splits,\n",
+ " ids=[doc.page_content for doc in splits],\n",
+ " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "id": "N_k9hEaUaim5"
+ },
+ "outputs": [],
+ "source": [
+ "# Check whether the vector store already holds an entry whose id equals `text` (documents are stored with their page_content as the id)\n",
+ "def is_in_vectorstore(vectorstore, text):\n",
+ " search_results = vectorstore.get(ids=[text])\n",
+ " if search_results and search_results[\"ids\"]:\n",
+ " return True\n",
+ " else:\n",
+ " return False"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "id": "9FlsMkYkaim5",
+ "outputId": "e91437c5-3c0e-427c-816b-4769361a20aa",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ],
+ "source": [
+ "is_in_vectorstore(vectorstore, \"Hello, new sentence\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "id": "wxndeR9Naim6",
+ "outputId": "42148072-0dc8-4449-e33c-9d187f1b565a",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 10
+ }
+ ],
+ "source": [
+ "is_in_vectorstore(vectorstore, splits[0].page_content)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "id": "bmaZFuDWaim6"
+ },
+ "outputs": [],
+ "source": [
+ "from langchain_upstage import UpstageDocumentParseLoader\n",
+ "\n",
+ "layzer = UpstageDocumentParseLoader(\"pdfs/kim-tse-2008.pdf\", output_format=\"html\")\n",
+ "# For improved memory efficiency, consider using the lazy_load method to load documents page by page.\n",
+ "docs = layzer.load() # or layzer.lazy_load()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "id": "A7KuZZqHaim6",
+ "outputId": "4231f237-9132-489d-a6b1-5d601b07dfff",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Splits: 132\n"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain_text_splitters import (\n",
+ " Language,\n",
+ " RecursiveCharacterTextSplitter,\n",
+ ")\n",
+ "\n",
+ "# 2. Split\n",
+ "text_splitter = RecursiveCharacterTextSplitter.from_language(\n",
+ " chunk_size=1000, chunk_overlap=100, language=Language.HTML\n",
+ ")\n",
+ "splits = text_splitter.split_documents(docs)\n",
+ "print(\"Splits:\", len(splits))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from langchain_chroma import Chroma\n",
+ "from langchain_upstage import UpstageEmbeddings\n",
+ "\n",
+ "vectorstore = Chroma(\n",
+ " persist_directory=\"./chroma_db\",\n",
+ " embedding_function=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
+ ")\n",
+ "retriever = vectorstore.as_retriever()\n",
+ "\n",
+ "unique_splits = [\n",
+ " split for split in splits if not is_in_vectorstore(vectorstore, split.page_content)\n",
+ "]\n",
+ "print(len(unique_splits))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "lEqnFToCkZfH",
+ "outputId": "8d5d9b42-27ea-4221-a21d-8de74fa3d3d9"
+ },
+ "execution_count": 13,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "132\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "id": "X1SWtlmkaim6"
+ },
+ "outputs": [],
+ "source": [
+ "from langchain_chroma import Chroma\n",
+ "from langchain_upstage import UpstageEmbeddings\n",
+ "from langchain.docstore.document import Document\n",
+ "\n",
+ "# Simplify metadata by converting complex data to simple types (str, int, float, bool)\n",
+ "def simplify_metadata(metadata):\n",
+ " simplified_metadata = {}\n",
+ " for key, value in metadata.items():\n",
+ " if isinstance(value, (str, int, float, bool)):\n",
+ " simplified_metadata[key] = value\n",
+ " else:\n",
+ " simplified_metadata[key] = str(value)\n",
+ " return simplified_metadata\n",
+ "\n",
+ "filtered_unique_splits = [\n",
+ " Document(page_content=split.page_content, metadata=simplify_metadata(split.metadata))\n",
+ " for split in unique_splits\n",
+ "]\n",
+ "\n",
+ "# 3. Embed & indexing\n",
+ "if len(filtered_unique_splits) > 0:\n",
+ " vectorstore = Chroma.from_documents(\n",
+ " ids=[split.page_content for split in filtered_unique_splits],\n",
+ " persist_directory=\"./chroma_db\",\n",
+ " documents=filtered_unique_splits,\n",
+ " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
+ " )\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "id": "3kxYmiUcaim7",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "2c26682e-77db-44cd-fffb-68d8700fbd12"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "0\n"
+ ]
+ }
+ ],
+ "source": [
+ "from langchain_chroma import Chroma\n",
+ "\n",
+ "vectorstore = Chroma(\n",
+ " persist_directory=\"./chroma_db\",\n",
+ " embedding_function=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
+ ")\n",
+ "retriever = vectorstore.as_retriever()\n",
+ "\n",
+ "unique_splits = [\n",
+ " split for split in splits if not is_in_vectorstore(vectorstore, split.page_content)\n",
+ "]\n",
+ "print(len(unique_splits))\n",
+ "\n",
+ "# 3. Embed & indexing\n",
+ "if len(unique_splits) > 0:\n",
+ " vectorstore = Chroma.from_documents(\n",
+ " ids=[split.page_content for split in unique_splits],\n",
+ " persist_directory=\"./chroma_db\",\n",
+ " documents=unique_splits,\n",
+ " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "id": "z5iN091Eaim7",
+ "outputId": "46de4889-b8bb-4371-f6cc-0add345013ba",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "
introduced bugs immediately. Several bug\n"
+ ]
+ }
+ ],
+ "source": [
+ "search_result = retriever.invoke(\"How to find problems in code?\")\n",
+ "print(search_result[0].page_content[:100])"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.10"
+ },
+ "colab": {
+ "provenance": [],
+ "include_colab_link": true
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
From 6c28fcb5f4d6c22447f8ea825ec367e7c4f6fa68 Mon Sep 17 00:00:00 2001
From: Hyesoo Kim <100982596+duper203@users.noreply.github.com>
Date: Thu, 3 Oct 2024 09:25:13 -0700
Subject: [PATCH 2/7] Update 05_1_ChromaDB.ipynb
---
05_1_ChromaDB.ipynb | 12 +-----------
1 file changed, 1 insertion(+), 11 deletions(-)
diff --git a/05_1_ChromaDB.ipynb b/05_1_ChromaDB.ipynb
index b73d11c..f870983 100644
--- a/05_1_ChromaDB.ipynb
+++ b/05_1_ChromaDB.ipynb
@@ -1,15 +1,5 @@
{
"cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "view-in-github",
- "colab_type": "text"
- },
- "source": [
- ""
- ]
- },
{
"cell_type": "markdown",
"metadata": {
@@ -433,4 +423,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
-}
\ No newline at end of file
+}
From bc39bd432eebe4348dafee63fbab16d8b4d8b8bf Mon Sep 17 00:00:00 2001
From: Hyesoo Kim <100982596+duper203@users.noreply.github.com>
Date: Thu, 3 Oct 2024 09:47:00 -0700
Subject: [PATCH 3/7] Created using Colab
---
05_1_ChromaDB.ipynb | 77 +++++++++++++++++++++++++++++----------------
1 file changed, 50 insertions(+), 27 deletions(-)
diff --git a/05_1_ChromaDB.ipynb b/05_1_ChromaDB.ipynb
index f870983..037b959 100644
--- a/05_1_ChromaDB.ipynb
+++ b/05_1_ChromaDB.ipynb
@@ -1,5 +1,15 @@
{
"cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "
"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {
@@ -63,7 +73,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 2,
"metadata": {
"id": "KyP3R-L1aim4"
},
@@ -95,10 +105,10 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 3,
"metadata": {
"id": "0DW5Q0Wkaim4",
- "outputId": "3ca329f1-407d-405a-fb1a-74f387754e3d",
+ "outputId": "df098096-582d-4f7e-a90c-affd61a68a77",
"colab": {
"base_uri": "https://localhost:8080/"
}
@@ -143,7 +153,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 4,
"metadata": {
"id": "N_k9hEaUaim5"
},
@@ -160,10 +170,10 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 5,
"metadata": {
"id": "9FlsMkYkaim5",
- "outputId": "e91437c5-3c0e-427c-816b-4769361a20aa",
+ "outputId": "8290c2c9-177c-4166-d7f3-368807ea2928",
"colab": {
"base_uri": "https://localhost:8080/"
}
@@ -177,7 +187,7 @@
]
},
"metadata": {},
- "execution_count": 9
+ "execution_count": 5
}
],
"source": [
@@ -186,10 +196,10 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 6,
"metadata": {
"id": "wxndeR9Naim6",
- "outputId": "42148072-0dc8-4449-e33c-9d187f1b565a",
+ "outputId": "92e464f3-bee5-4048-96be-b6247c99cc79",
"colab": {
"base_uri": "https://localhost:8080/"
}
@@ -203,7 +213,7 @@
]
},
"metadata": {},
- "execution_count": 10
+ "execution_count": 6
}
],
"source": [
@@ -212,7 +222,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 7,
"metadata": {
"id": "bmaZFuDWaim6"
},
@@ -227,10 +237,10 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 8,
"metadata": {
"id": "A7KuZZqHaim6",
- "outputId": "4231f237-9132-489d-a6b1-5d601b07dfff",
+ "outputId": "5c1bbf0a-1844-4ede-e682-eed0f4a3f2b2",
"colab": {
"base_uri": "https://localhost:8080/"
}
@@ -280,9 +290,9 @@
"base_uri": "https://localhost:8080/"
},
"id": "lEqnFToCkZfH",
- "outputId": "8d5d9b42-27ea-4221-a21d-8de74fa3d3d9"
+ "outputId": "a22c8c5e-7d16-4c5e-ca9f-cb739dd42847"
},
- "execution_count": 13,
+ "execution_count": 9,
"outputs": [
{
"output_type": "stream",
@@ -295,11 +305,23 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 10,
"metadata": {
- "id": "X1SWtlmkaim6"
+ "id": "X1SWtlmkaim6",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "06f4520e-ba67-4a13-a6d6-6222f67535a8"
},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "132\n"
+ ]
+ }
+ ],
"source": [
"from langchain_chroma import Chroma\n",
"from langchain_upstage import UpstageEmbeddings\n",
@@ -315,30 +337,31 @@
" simplified_metadata[key] = str(value)\n",
" return simplified_metadata\n",
"\n",
- "filtered_unique_splits = [\n",
+ "unique_splits = [\n",
" Document(page_content=split.page_content, metadata=simplify_metadata(split.metadata))\n",
" for split in unique_splits\n",
"]\n",
+ "print(len(unique_splits))\n",
"\n",
"# 3. Embed & indexing\n",
- "if len(filtered_unique_splits) > 0:\n",
+ "if len(unique_splits) > 0:\n",
" vectorstore = Chroma.from_documents(\n",
- " ids=[split.page_content for split in filtered_unique_splits],\n",
+ " ids=[split.page_content for split in unique_splits],\n",
" persist_directory=\"./chroma_db\",\n",
- " documents=filtered_unique_splits,\n",
+ " documents=unique_splits,\n",
" embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
" )\n"
]
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 11,
"metadata": {
"id": "3kxYmiUcaim7",
"colab": {
"base_uri": "https://localhost:8080/"
},
- "outputId": "2c26682e-77db-44cd-fffb-68d8700fbd12"
+ "outputId": "a38c52fb-555c-4211-ffee-bee3e2c9d663"
},
"outputs": [
{
@@ -375,10 +398,10 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 12,
"metadata": {
"id": "z5iN091Eaim7",
- "outputId": "46de4889-b8bb-4371-f6cc-0add345013ba",
+ "outputId": "341a58d6-fdd4-4b72-8924-19fa6e74e5ed",
"colab": {
"base_uri": "https://localhost:8080/"
}
@@ -423,4 +446,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
-}
+}
\ No newline at end of file
From 64d81c4e9e15299e39a3381d724e5cc91f3b3fef Mon Sep 17 00:00:00 2001
From: Hyesoo Kim <100982596+duper203@users.noreply.github.com>
Date: Thu, 3 Oct 2024 09:47:54 -0700
Subject: [PATCH 4/7] Update 05_1_ChromaDB.ipynb
---
05_1_ChromaDB.ipynb | 12 +-----------
1 file changed, 1 insertion(+), 11 deletions(-)
diff --git a/05_1_ChromaDB.ipynb b/05_1_ChromaDB.ipynb
index 037b959..56aabd0 100644
--- a/05_1_ChromaDB.ipynb
+++ b/05_1_ChromaDB.ipynb
@@ -1,15 +1,5 @@
{
"cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "view-in-github",
- "colab_type": "text"
- },
- "source": [
- "
"
- ]
- },
{
"cell_type": "markdown",
"metadata": {
@@ -446,4 +436,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
-}
\ No newline at end of file
+}
From c9feffcec34b0988cc6d40756aa64bb459461778 Mon Sep 17 00:00:00 2001
From: Hyesoo Kim <100982596+duper203@users.noreply.github.com>
Date: Thu, 3 Oct 2024 09:49:50 -0700
Subject: [PATCH 5/7] Delete Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb
---
Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb | 338 --------------------
1 file changed, 338 deletions(-)
delete mode 100644 Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb
diff --git a/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb b/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb
deleted file mode 100644
index 31c3408..0000000
--- a/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb
+++ /dev/null
@@ -1,338 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "
\n",
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# 05-1.ChromaDB\n",
- "\n",
- "## Overview \n",
- "In this exercise, we will explore how to utilize ChromaDB to embed documents and construct a vectorspace. Additionally, we will gain insight into the creation of a Retriever object to facilitate efficient query searches within documents. This tutorial will guide you through the process of embedding documents and using a vectorspace for effective information retrieval.\n",
- " \n",
- "## Purpose of the Exercise\n",
- "The purpose of this exercise is to demonstrate the use of the Solar Embedding API to generate embeddings and create a vectorspace. By the end of this tutorial, users will be able to create a Retriever object and conduct efficient searches within the vectorspace, thereby enhancing the ability to retrieve relevant information from embedded documents.\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Keyword VS Semantic Search \n",
- "\n",
- "\n",
- "from https://blog.dataiku.com/semantic-search-an-overlooked-nlp-superpower"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "from https://sreent.medium.com/llms-embeddings-and-vector-search-d4bd9362df56"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "! pip3 install -qU markdownify langchain-upstage rank_bm25 python-dotenv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "# @title set API key\n",
- "import os\n",
- "import getpass\n",
- "from pprint import pprint\n",
- "import warnings\n",
- "\n",
- "warnings.filterwarnings(\"ignore\")\n",
- "\n",
- "from IPython import get_ipython\n",
- "\n",
- "if \"google.colab\" in str(get_ipython()):\n",
- " # Running in Google Colab. Please set the UPSTAGE_API_KEY in the Colab Secrets\n",
- " from google.colab import userdata\n",
- " os.environ[\"UPSTAGE_API_KEY\"] = userdata.get(\"UPSTAGE_API_KEY\")\n",
- "else:\n",
- " # Running locally. Please set the UPSTAGE_API_KEY in the .env file\n",
- " from dotenv import load_dotenv\n",
- "\n",
- " load_dotenv()\n",
- "\n",
- "if \"UPSTAGE_API_KEY\" not in os.environ:\n",
- " os.environ[\"UPSTAGE_API_KEY\"] = getpass.getpass(\"Enter your Upstage API key: \")\n"
-]
-
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[Document(page_content='Korea is a beautiful country to visit in the spring.'), Document(page_content='The best time to visit Korea is in the fall.'), Document(page_content='Best way to find bug is using unit test.'), Document(page_content='Python is a great programming language for beginners.'), Document(page_content='Sung Kim is a great teacher.')]\n"
- ]
- }
- ],
- "source": [
- "from langchain_chroma import Chroma\n",
- "from langchain_upstage import UpstageEmbeddings\n",
- "from langchain.docstore.document import Document\n",
- "\n",
- "from langchain_text_splitters import (\n",
- " Language,\n",
- " RecursiveCharacterTextSplitter,\n",
- ")\n",
- "\n",
- "sample_text = [\n",
- " \"Korea is a beautiful country to visit in the spring.\",\n",
- " \"The best time to visit Korea is in the fall.\",\n",
- " \"Best way to find bug is using unit test.\",\n",
- " \"Python is a great programming language for beginners.\",\n",
- " \"Sung Kim is a great teacher.\",\n",
- "]\n",
- "\n",
- "splits = RecursiveCharacterTextSplitter().create_documents(sample_text)\n",
- "\n",
- "print(splits)\n",
- "\n",
- "vectorstore = Chroma.from_documents(\n",
- " documents=splits,\n",
- " ids=[doc.page_content for doc in splits],\n",
- " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [],
- "source": [
- "# check if text is in the vector store\n",
- "def is_in_vectorstore(vectorstore, text):\n",
- " search_results = vectorstore.get(ids=[text])\n",
- " if search_results and search_results[\"ids\"]:\n",
- " return True\n",
- " else:\n",
- " return False"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "False"
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "is_in_vectorstore(vectorstore, \"Hello, new sentence\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "is_in_vectorstore(vectorstore, splits[0].page_content)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "from langchain_upstage import UpstageLayoutAnalysisLoader\n",
- "\n",
- "\n",
- "layzer = UpstageLayoutAnalysisLoader(\"pdfs/kim-tse-2008.pdf\", output_type=\"html\")\n",
- "# For improved memory efficiency, consider using the lazy_load method to load documents page by page.\n",
- "docs = layzer.load() # or layzer.lazy_load()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Splits: 125\n"
- ]
- }
- ],
- "source": [
- "from langchain_text_splitters import (\n",
- " Language,\n",
- " RecursiveCharacterTextSplitter,\n",
- ")\n",
- "\n",
- "# 2. Split\n",
- "text_splitter = RecursiveCharacterTextSplitter.from_language(\n",
- " chunk_size=1000, chunk_overlap=100, language=Language.HTML\n",
- ")\n",
- "splits = text_splitter.split_documents(docs)\n",
- "print(\"Splits:\", len(splits))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "125\n"
- ]
- }
- ],
- "source": [
- "from langchain_chroma import Chroma\n",
- "\n",
- "vectorstore = Chroma(\n",
- " persist_directory=\"./chroma_db\",\n",
- " embedding_function=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
- ")\n",
- "retriever = vectorstore.as_retriever()\n",
- "\n",
- "\n",
- "unique_splits = [\n",
- " split for split in splits if not is_in_vectorstore(vectorstore, split.page_content)\n",
- "]\n",
- "print(len(unique_splits))\n",
- "\n",
- "# 3. Embed & indexing\n",
- "if len(unique_splits) > 0:\n",
- " vectorstore = Chroma.from_documents(\n",
- " ids=[split.page_content for split in unique_splits],\n",
- " persist_directory=\"./chroma_db\",\n",
- " documents=unique_splits,\n",
- " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0\n"
- ]
- }
- ],
- "source": [
- "from langchain_chroma import Chroma\n",
- "\n",
- "vectorstore = Chroma(\n",
- " persist_directory=\"./chroma_db\",\n",
- " embedding_function=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
- ")\n",
- "retriever = vectorstore.as_retriever()\n",
- "\n",
- "unique_splits = [\n",
- " split for split in splits if not is_in_vectorstore(vectorstore, split.page_content)\n",
- "]\n",
- "print(len(unique_splits))\n",
- "\n",
- "# 3. Embed & indexing\n",
- "if len(unique_splits) > 0:\n",
- " vectorstore = Chroma.from_documents(\n",
- " ids=[split.page_content for split in unique_splits],\n",
- " persist_directory=\"./chroma_db\",\n",
- " documents=unique_splits,\n",
- " embedding=UpstageEmbeddings(model=\"solar-embedding-1-large\"),\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "
introduced bugs immediately. Several bug-finding techni-
ques c\n"
- ]
- }
- ],
- "source": [
- "search_result = retriever.invoke(\"How to find problems in code?\")\n",
- "print(search_result[0].page_content[:100])"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
From 18541b1c86d9230bec2869de8a4ffbf71bfa497a Mon Sep 17 00:00:00 2001
From: Hyesoo Kim <100982596+duper203@users.noreply.github.com>
Date: Thu, 3 Oct 2024 09:50:02 -0700
Subject: [PATCH 6/7] update 05-1
---
.../05_1_ChromaDB.ipynb | 0
1 file changed, 0 insertions(+), 0 deletions(-)
rename 05_1_ChromaDB.ipynb => Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb (100%)
diff --git a/05_1_ChromaDB.ipynb b/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb
similarity index 100%
rename from 05_1_ChromaDB.ipynb
rename to Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb
From 362f9ee8f01ab133c978bc3ecf0987e903b96146 Mon Sep 17 00:00:00 2001
From: Hyesoo Kim <100982596+duper203@users.noreply.github.com>
Date: Mon, 7 Oct 2024 09:50:02 -0700
Subject: [PATCH 7/7] Update 05_1_ChromaDB.ipynb : env setting
---
Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb | 46 ++++++++++-----------
1 file changed, 23 insertions(+), 23 deletions(-)
diff --git a/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb b/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb
index 56aabd0..0c369e2 100644
--- a/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb
+++ b/Solar-Fullstack-LLM-101/05_1_ChromaDB.ipynb
@@ -69,29 +69,29 @@
},
"outputs": [],
"source": [
- "# @title set API key\n",
- "import os\n",
- "import getpass\n",
- "from pprint import pprint\n",
- "import warnings\n",
- "\n",
- "warnings.filterwarnings(\"ignore\")\n",
- "\n",
- "from IPython import get_ipython\n",
- "\n",
- "if \"google.colab\" in str(get_ipython()):\n",
- " # Running in Google Colab. Please set the UPSTAGE_API_KEY in the Colab Secrets\n",
- " from google.colab import userdata\n",
- " os.environ[\"UPSTAGE_API_KEY\"] = userdata.get(\"UPSTAGE_API_KEY\")\n",
- "else:\n",
- " # Running locally. Please set the UPSTAGE_API_KEY in the .env file\n",
- " from dotenv import load_dotenv\n",
- "\n",
- " load_dotenv()\n",
- "\n",
- "if \"UPSTAGE_API_KEY\" not in os.environ:\n",
- " os.environ[\"UPSTAGE_API_KEY\"] = getpass.getpass(\"Enter your Upstage API key: \")\n"
- ]
+ "# @title set API key\n",
+ "from pprint import pprint\n",
+ "import os\n",
+ "\n",
+ "import warnings\n",
+ "\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "\n",
+ "if \"google.colab\" in str(get_ipython()):\n",
+ " # Running in Google Colab. Please set the UPSTAGE_API_KEY in the Colab Secrets\n",
+ " from google.colab import userdata\n",
+ "\n",
+ " os.environ[\"UPSTAGE_API_KEY\"] = userdata.get(\"UPSTAGE_API_KEY\")\n",
+ "else:\n",
+ " # Running locally. Please set the UPSTAGE_API_KEY in the .env file\n",
+ " from dotenv import load_dotenv\n",
+ "\n",
+ " load_dotenv()\n",
+ "\n",
+ "assert (\n",
+ " \"UPSTAGE_API_KEY\" in os.environ\n",
+ "), \"Please set the UPSTAGE_API_KEY environment variable\""
+ ]
},
{
"cell_type": "code",