diff --git a/.gitignore b/.gitignore index a1987fe..8ffc2ab 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ .vscode/ .vscode/launch.json backup -chroma_boardgames/ +vector_store/ env/ .env __pycache__ diff --git a/README.md b/README.md index e8dee1c..4f9f94e 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,7 @@ Clone this repository and create a clean python v3.10 virtual environment and ac The following is a high-level list of components used to run this local RAG: - langchain -- streamlit -- streamlit-chat +- ollama - pypdf - chromadb - fastembed @@ -37,7 +36,7 @@ Create a `.env` file in the root directory and add the following environment var ```.env -CHROMA_PATH=chroma_boardgames +CHROMA_DB_PATH_PDF=chroma_boardgames DATA_PATH_BG=data_boardgames ``` @@ -53,7 +52,7 @@ If you need to clear the database for any reason, run: The above command will remove the chroma database. If you need to recreate it, simply rerun `populate_database.py` -## Running the RAG +## Running the RAG from the commandline: The instruction manuals for both Monopoly and Ticket To Ride have been loaded into the Chroma DB. Ask the RAG questions about these two board games and see how well it does answering your questions. The RAG can be invoked using the following command with the sample question: @@ -68,6 +67,14 @@ Here are some additional questions you can try: You can also browse the instruction manuals that are in the `./data_boardgames` folder to come up with your own questions. +## Running the FASTAPI server to expose the RAG via API + +Start the FASTPI server to expose api's that can be called from the ai_rag_ui or curl. + +``` +python query_data.py How do I build a hotel in monopoly? +``` + ## Running the test cases ``` diff --git a/ai_local_rag/__init__.py b/ai_local_rag/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api.py b/ai_local_rag/api.py similarity index 79% rename from api.py rename to ai_local_rag/api.py index 2f3627a..b43cd89 100644 --- a/api.py +++ b/ai_local_rag/api.py @@ -1,11 +1,13 @@ import logging -from query_data import query_rag -from fastapi import FastAPI, Request, HTTPException, status + +from fastapi import FastAPI, HTTPException, Request, status from fastapi.exceptions import RequestValidationError from fastapi.responses import JSONResponse from pydantic import BaseModel -from models.rag_query_model import QueryInput, QueryOutput -from utils.async_utils import async_retry + +from ai_local_rag.models.rag_query_model import QueryInput, QueryOutput +from ai_local_rag.query_data import query_rag +from ai_local_rag.utils.async_utils import async_retry class Message(BaseModel): @@ -23,6 +25,10 @@ class Message(BaseModel): channel_list = ["general", "dev", "marketing"] message_map = {} +logging.basicConfig() +logger = logging.getLogger("api") +logger.setLevel(logging.DEBUG) + @app.exception_handler(RequestValidationError) async def validation_exception_handler(request: Request, exc: RequestValidationError): @@ -60,20 +66,21 @@ def post_message(message: Message): @app.post("/rag-query") async def query_rag_api(query: QueryInput): - print(f"api.py - API Request Data: {query}") + logger.info(f"api.py - API Request Data: {query}") query_response = query_rag({"input": query}) - print(query_response) + logger.debug(f"Query Response: - {query_response}") # query: str # response: str # sources: list[str] query_response2 = { "query": query, "response": query_response["response"], "sources": query_response["sources"]} - print(f"Query Response2: {query_response2}") + logger.info(f"Query Response2: {query_response2}") # query_response["intermediate_steps"] = [ # str(s) for s in query_response["intermediate_steps"] # ] + logger.debug(f"Query Response: - {query_response2}") return query_response2 @@ -83,7 +90,7 @@ async def query_rag_api2( query: QueryInput, ) -> QueryOutput: query_response = query_rag({"input": query}) - print(query_response) + logger.debug(f"Query Response: {query_response}") # query: str # response: str @@ -91,7 +98,7 @@ async def query_rag_api2( query_text = query["query"] query_response2 = { "query": query_text, "response": query_response["response"], "sources": query_response["sources"]} - print(f"Query Response2: {query_response2}") + logger.debug(f"Query Response2: {query_response2}") # query_response["intermediate_steps"] = [ # str(s) for s in query_response["intermediate_steps"] diff --git a/ai_local_rag/chroma/__init__.py b/ai_local_rag/chroma/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ai_local_rag/chroma/chroma_client.py b/ai_local_rag/chroma/chroma_client.py new file mode 100644 index 0000000..38ea2cc --- /dev/null +++ b/ai_local_rag/chroma/chroma_client.py @@ -0,0 +1,36 @@ +from chromadb.config import Settings +import chromadb +import logging +import os + +import dotenv +from langchain.vectorstores import Chroma + +# from ai_local_rag.utils.get_embedding_function import get_embedding_function + + +dotenv.load_dotenv() + + +logging.basicConfig() +logger = logging.getLogger("chroma_client") +logger.setLevel(logging.DEBUG) + + +chroma_path = os.getenv("CHROMA_PATH_SLACK") +collection_name = os.getenv("CHROMA_SLACK_COLLECTION") + + +client = chromadb.Client(Settings( + persist_directory=chroma_path +)) + +logger.info(f"Number of collections: {client.count_collections()}") + +logger.info(client.list_collections()) +""" +collection = client.get_collection(name=collection_name) +result = collection.query(n_results=5) +logging.info(result) +logging.info(len(client.list_collections())) +""" diff --git a/populate_database.py b/ai_local_rag/chroma/populate_database.py similarity index 90% rename from populate_database.py rename to ai_local_rag/chroma/populate_database.py index 8e07dd7..b76fbcb 100644 --- a/populate_database.py +++ b/ai_local_rag/chroma/populate_database.py @@ -1,16 +1,16 @@ import argparse import os -from dotenv import load_dotenv - - import shutil + +from dotenv import load_dotenv +from langchain.schema.document import Document # from langchain.document_loaders.pdf import PyPDFDirectoryLoader from langchain_community.document_loaders import PyPDFDirectoryLoader -from langchain_text_splitters import RecursiveCharacterTextSplitter -from langchain.schema.document import Document -from get_embedding_function import get_embedding_function # from langchain.vectorstores.chroma import Chroma from langchain_community.vectorstores import Chroma +from langchain_text_splitters import RecursiveCharacterTextSplitter + +from ai_local_rag.utils.get_embedding_function import get_embedding_function_for_pdf # Load Config Settings load_dotenv() # take environment variables from .env. @@ -56,10 +56,10 @@ def _split_documents(documents: list[Document]): def _add_to_chroma(chunks: list[Document]): - chroma_path = os.getenv("CHROMA_PATH") + chroma_db_path_pdf = os.getenv("CHROMA_DB_PATH_PDF") # Load the existing database. db = Chroma( - persist_directory=chroma_path, embedding_function=get_embedding_function() + persist_directory=chroma_db_path_pdf, embedding_function=get_embedding_function_for_pdf() ) # Calculate Page IDs. @@ -119,10 +119,10 @@ def clear_database(): Remove the database so that we can rebuild it """ load_dotenv() # take environment variables from .env. - chroma_path = os.getenv("CHROMA_PATH") + chroma_db_path = os.getenv("CHROMA_DB_PATH_PDF") - if os.path.exists(chroma_path): - shutil.rmtree(chroma_path) + if os.path.exists(chroma_db_path): + shutil.rmtree(chroma_db_path) if __name__ == "__main__": diff --git a/ai_local_rag/chroma/reset_database.py b/ai_local_rag/chroma/reset_database.py new file mode 100644 index 0000000..63eb923 --- /dev/null +++ b/ai_local_rag/chroma/reset_database.py @@ -0,0 +1,10 @@ +import os +from dotenv import load_dotenv +import ai_local_rag.chroma.populate_database as populate_database + +# Load Config Settings +load_dotenv() # take environment variables from .env. +chroma_db_path_pdf = os.getenv("CHROMA_DB_PATH_PDF") + +populate_database.clear_database() +print(f"Removed all content from database {chroma_db_path_pdf}") diff --git a/ai_local_rag/chroma/slack_loader.py b/ai_local_rag/chroma/slack_loader.py new file mode 100644 index 0000000..6a09009 --- /dev/null +++ b/ai_local_rag/chroma/slack_loader.py @@ -0,0 +1,286 @@ +""" +Loader for slack +""" +import os +import logging +import dotenv + +import numpy as np +from dotenv import load_dotenv +import chromadb +from langchain import hub +from langchain.agents import AgentExecutor, create_openai_tools_agent +from langchain.schema.document import Document +from langchain_text_splitters import RecursiveCharacterTextSplitter +from langchain_community.agent_toolkits import SlackToolkit +from langchain_community.chat_models import ChatOllama +from langchain_community.document_loaders import SlackDirectoryLoader +from langchain_community.vectorstores import Chroma +from ai_local_rag.utils.get_embedding_function import get_embedding_function_for_slack + +dotenv.load_dotenv() +logging.basicConfig() +logger = logging.getLogger("slack_loader") +logger.setLevel(logging.DEBUG) + +chroma_path = os.getenv("CHROMA_DB_PATH_SLACK") +chroma_collection = os.getenv("CHROMA_SLACK_COLLECTION") + + +def slack_toolkit(): + toolkit = SlackToolkit() + my_tools = toolkit.get_tools() + + # llm = ChatOpenAI(temperature=0, model="gpt-4") + llm = ChatOllama(model="mistral") + + prompt = hub.pull("hwchase17/openai-tools-agent") + agent = create_openai_tools_agent( + tools=my_tools, + llm=llm, + prompt=prompt, + ) + agent_executor = AgentExecutor(agent=agent, tools=my_tools, verbose=True) + agent_executor.invoke( + { + "input": "Send a greeting to my coworkers in the #general channel. Note use `channel` as key of channel id, and `message` as key of content to sent in the channel." + } + ) + agent_executor.invoke( + {"input": "How many channels are in the workspace? Please list out their names."} + ) + agent_executor.invoke( + { + "input": "Tell me the number of messages sent in the #introductions channel from the past month." + } + ) + + +def _slack_loader(): + local_zip_file = os.getenv("SLACK_EXPORT_ZIP") + slack_workspace_url = os.getenv("SLACK_WORKSPACE_URL") + + loader = SlackDirectoryLoader(local_zip_file, slack_workspace_url) + docs = loader.load() + logger.info(f"Slack export contains {len(docs)} docs") + return docs + + +def _split_documents(documents: list[Document]): + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=800, + chunk_overlap=80, + length_function=len, + is_separator_regex=False, + ) + return text_splitter.split_documents(documents) + + +def _print_chunks(chunks): + i = 0 + for chunk in chunks: + logger.info(f"chunk {i} contains: {chunk}\n") + i += 1 + if i > 5: + break + + +def _calculate_chunk_ids(chunks: list[Document]): + + # This will create IDs like "c-linkedtrust:U069UCY6WPL:1721902091.115329:2" + # Channel : UserId: Timestamp: Chunk Index + + # Document format is:[Document(metadata={'soource': 'xxxx'})] + + last_page_id = None + current_chunk_index = 0 + + response = {} + chunk_id_list = [] + metadata_list = [] + page_content = [] + + for chunk in chunks: + user = chunk.metadata.get("user") + channel = chunk.metadata.get("channel") + timestamp = chunk.metadata.get("timestamp") + current_page_id = f"{channel}:{user}:{timestamp}" + + page_content.append(chunk.page_content) + + # If the page ID is the same as the last one, increment the index. + if current_page_id == last_page_id: + current_chunk_index += 1 + else: + current_chunk_index = 0 + + # Calculate the chunk ID. + chunk_id = f"{current_page_id}:{current_chunk_index}" + last_page_id = current_page_id + + # Add the id to the page meta-data. + chunk.metadata["id"] = chunk_id + + # Add the metadata to the metadata list + metadata_list.append(chunk.metadata) + + # Add it to the list of ids + chunk_id_list.append(chunk_id) + + response["texts"] = page_content + response["ids"] = chunk_id_list + response["metadatas"] = metadata_list + + # return chunks + return response + + +def _add_to_chroma_with_langchain(chunks: list[Document]): + + logger.info(f"_add_to_chroma - collection name: {chroma_collection}") + # Load the existing database. + db = Chroma(collection_name=chroma_collection, + persist_directory=chroma_path, embedding_function=get_embedding_function_for_slack() + ) + + db.persist() + + +def _add_to_chroma(chunks_with_ids: list[Document]): + + chroma_client = chromadb.PersistentClient(path=chroma_path) + # settings=Settings(chroma_db_impl="duckdb+parquet")) + + logger.info(f"_add_to_chroma - collection name: {chroma_collection}") + + # Create or load a collection + collection_name = chroma_collection + collection = None + try: + collection = chroma_client.get_or_create_collection(collection_name) + except Exception as e: + print(f"Error accessing collection: {e}") + + texts = chunks_with_ids['texts'] + metadatas = chunks_with_ids['metadatas'] + ids = chunks_with_ids['ids'] + + # Get the embeddings function + embedding_function = get_embedding_function_for_slack() + + # Generate the embeddings + embeddings = embedding_function(texts) + # print("Embeddings: ", embeddings) + + # Debugging: Print lengths of all inputs + logger.debug(f"Texts length: {len(texts)}\n") + logger.debug(f"Embeddings length: {len(embeddings)}\n") + logger.debug(f"Metadata length: {len(metadatas)}\n") + logger.debug(f"IDs length: {len(ids)}\n") + + # Ensure all lists have the same length + assert len(texts) == len(embeddings) == len(metadatas) == len( + ids), "Lengths of input lists do not match." + + # Ensure we have an embedding for each ID + for i in range(len(ids)): + print(f"ID: {ids[i]}") + print(f"Embedding: {embeddings[i]}") + + # Add text and embeddings to the collection + try: + collection.add(documents=texts, embeddings=embeddings, + metadatas=metadatas, ids=ids) + + except Exception as e: + print(f"Error adding to collection: {e}") + + logger.info("DONE LOADING and QUERING") + + +def _query(): + collection = [] + + # Create or load a collection + collection_name = chroma_collection + collection = None + chroma_client = chromadb.PersistentClient(path=chroma_path) + try: + collection = chroma_client.get_collection(collection_name) + except Exception as e: + print(f"Error accessing collection: {e}") + + # QUERY WITH METADATA + query_text = 'Great i have tasks on the front and in back so i will contact with both' + embedding_function = get_embedding_function_for_slack() + query_embedding = embedding_function(query_text) + query_include = ["metadatas", "distances", "embeddings"] + + # Ensure query_embedding is in the expected format + if isinstance(query_embedding, np.ndarray): + query_embedding = query_embedding.tolist() + + # Query the collection + try: + result = collection.query( + query_embedding, n_results=1, include=query_include) + print("Query Result: ", result) + except Exception as e: + logger.info(f"Error querying the collection: {e}") + + # Example: Query the collection and retrieve results with metadata + try: + results = collection.query( + query_embeddings=query_embedding, + n_results=5, # Number of results to retrieve + include=query_include + ) + + # Process and print results + ids = results.get('ids', []) + embeddings = results.get('embeddings', []) + distances = results.get('distances', []) + metadatas = results.get('metadatas', []) + + # Process and print results + # for i in range(len(ids)): + for i, result_id in enumerate(ids): + print(f"Result {i + 1}:") + print(f"ID: {result_id}") + + if distances: + print(f"Distance: {distances[i]}") + else: + print("Distance not available.") + + if metadatas: + print(f"Metadata: {metadatas[i]}") + else: + print("Metadata not available.") + + if embeddings: + print(f"Embedding: {embeddings[i]}") + else: + print("Embedding not available.") + + print("-" * 40) + except Exception as e: + print(f"Error querying the collection: {e}") + + +def main(): + # Load Config Settings + logger.info("STARTING") + load_dotenv() # take environment variables from .env. + documents = _slack_loader() + chunks = _split_documents(documents) + _print_chunks(chunks) + chunks = _calculate_chunk_ids(chunks) + _print_chunks(chunks) + _add_to_chroma(chunks) + _query() + logger.info("FINISHED") + + +if __name__ == "__main__": + main() diff --git a/models/rag_query_model.py b/ai_local_rag/models/rag_query_model.py similarity index 100% rename from models/rag_query_model.py rename to ai_local_rag/models/rag_query_model.py diff --git a/query_data.py b/ai_local_rag/query_data.py similarity index 87% rename from query_data.py rename to ai_local_rag/query_data.py index eea12be..b839045 100644 --- a/query_data.py +++ b/ai_local_rag/query_data.py @@ -6,11 +6,11 @@ from langchain.prompts import ChatPromptTemplate from langchain_community.llms.ollama import Ollama -from get_embedding_function import get_embedding_function +from ai_local_rag.utils.get_embedding_function import get_embedding_function_for_pdf # Load Config Settings load_dotenv() # take environment variables from .env. -CHROMA_PATH = chroma_path = os.getenv("CHROMA_PATH") +chroma_db_path_pdf = os.getenv("CHROMA_DB_PATH_PDF") PROMPT_TEMPLATE = """ Answer the question based only on the following context: @@ -34,8 +34,8 @@ def main(): def query_rag(query_text: str): # Prepare the DB. - embedding_function = get_embedding_function() - db = Chroma(persist_directory=CHROMA_PATH, + embedding_function = get_embedding_function_for_pdf() + db = Chroma(persist_directory=chroma_db_path_pdf, embedding_function=embedding_function) # Search the DB. diff --git a/ai_local_rag/query_slack_data.py b/ai_local_rag/query_slack_data.py new file mode 100644 index 0000000..db716a1 --- /dev/null +++ b/ai_local_rag/query_slack_data.py @@ -0,0 +1,216 @@ +import argparse +import logging +import os + +import chromadb +import numpy as np +from dotenv import load_dotenv + +from langchain.prompts import ChatPromptTemplate +from langchain_community.llms.ollama import Ollama + +from ai_local_rag.utils.get_embedding_function import \ + get_embedding_function_for_slack + +# Load Config Settings +load_dotenv() # take environment variables from .env. + + +logging.basicConfig() +logger = logging.getLogger("query_slack_data") +logger.setLevel(logging.DEBUG) + +chroma_path = os.getenv("CHROMA_DB_PATH_SLACK") +chroma_collection = os.getenv("CHROMA_SLACK_COLLECTION") + +verbose_str = os.getenv("VERBOSE").lower() +VERBOSE = False +if verbose_str == "true": + VERBOSE = True + +PROMPT_TEMPLATE = """ +The following is a relevant document based on your query about "{query}": + +Document ID: {doc_id} +Similarity Score: {score} +Document Text: +{doc_text} + +How can I assist you further with this information? +""" + + +def main(): + # Create CLI. + parser = argparse.ArgumentParser() + parser.add_argument("query_text", type=str, help="The query text.") + args = parser.parse_args() + query_text = args.query_text + query_slack_rag(query_text) + + +def _get_collection(): + # Create or load a collection + collection_name = chroma_collection + collection = None + chroma_client = chromadb.PersistentClient(path=chroma_path) + try: + collection = chroma_client.get_collection(collection_name) + + return collection + except Exception as e: + print(f"Error accessing collection: {e}") + + +def query_slack_rag(query_text: str): + # Load the collection + collection = _get_collection() + + # QUERY WITH METADATA + embedding_function = get_embedding_function_for_slack() + query_embedding = embedding_function(query_text) + query_include = ["metadatas", "documents", "distances", "embeddings"] + + # Ensure query_embedding is in the expected format + if isinstance(query_embedding, np.ndarray): + query_embedding = query_embedding.tolist() + + # Query the collection and retrieve results with the specified includes + try: + results = collection.query( + query_embeddings=query_embedding, + n_results=1, # Number of results to retrieve + include=query_include + ) + result_len = len(results.get('ids', [])) + logger.info(f"Query returned {result_len} item(s)\n") + + if VERBOSE: + + # Process and print results + ids = results.get('ids', []) + embeddings = results.get('embeddings', []) + distances = results.get('distances', []) + metadatas = results.get('metadatas', []) + documents = results.get('documents', []) + + # Process and print results + # for i in range(len(ids)): + for i, result_id in enumerate(ids): + logger.debug(f"Result {i + 1}:") + logger.debug(f"ID: {result_id}") + + if distances: + logger.debug( + f"Distance (Similarity Score) {i + 1}: {distances[i]}\n") + else: + logger.debug("Distance not available.") + + if documents: + logger.debug(f"Documents {i + 1}: {documents[i]}\n") + else: + logger.debug("Distance not available.") + + if metadatas: + logger.debug(f"Metadata {i + 1}: {metadatas[i]}\n") + else: + logger.debug("Metadata not available.") + + if embeddings: + # logger.debug(f"Embedding: {embeddings[i]}") + logger.debug(f"Retreived an embedding {i + 1}\n") + else: + print("Embedding not available.") + + logger.debug("-" * 40) + + ##### Query Complete #### + + # Extract the first result (most similar) + # Extract the first result (most similar) + result_id = results.get('ids', [None])[0] + result_metadata = results.get('metadatas', [None])[0] + result_text = results.get('documents', [None])[0] + result_distance = results.get('distances', [None])[0] + + logger.debug(f"Most similar result ID: {result_id}") + logger.debug(f"Distance (similarity score): {result_distance}\n") + logger.debug(f"Document text: {result_text}") + logger.debug(f"Metadata: {result_metadata}\n") + + ##### Build the Prompt #### + chat_prompt_template = ChatPromptTemplate.from_template( + PROMPT_TEMPLATE) + # Fill the template with actual data from the query result + filled_prompt = [] + filled_prompt.append(chat_prompt_template.format( + query=query_text[0], + doc_id=result_id[0], + score=round(result_distance[0], + 4) if result_distance is not None else "N/A", + doc_text=result_text[0] or "No document found." + )) + + logger.debug(f"Formatted Chat Prompt: {filled_prompt}\n") + + # Assume you have a language model set up (like an OpenAI model) + language_model = Ollama(model="mistral") + llm_result = language_model.generate(filled_prompt) + # max_tokens=50, + # temperature=0.7, + # num_return_sequences=3 + # logger.debug(f"\nLanguage Model's 'Generate' Response:") + # logger.debug(llm_result) + + # The 'invoke' function returns a much more simple response than 'generate' + # response2 = language_model.invoke(filled_prompt) + # logger.debug(f"\nLanguage Model's 'Invoke' Response:") + # logger.debug(response2) + + # Extracting elements from the LLMResult object + + # 1. Extract the generated texts + generated_texts = [ + generation.text for generation_list in llm_result.generations for generation in generation_list] + logger.debug(f"Generated Texts:") + for text in generated_texts: + logger.debug(f"- {text}") + + # 2. Extract token usage information (if available) + if llm_result.llm_output and "token_usage" in llm_result.llm_output: + token_usage = llm_result.llm_output["token_usage"] + total_tokens = token_usage.get("total_tokens", 0) + prompt_tokens = token_usage.get("prompt_tokens", 0) + completion_tokens = token_usage.get("completion_tokens", 0) + + logger.debug(f"Token Usage:") + logger.debug(f"- Total Tokens: {total_tokens}") + logger.debug(f"- Prompt Tokens: {prompt_tokens}") + logger.debug(f"- Completion Tokens: {completion_tokens}\n") + + # 3. Extract the model name (if available) + # model_name = llm_result.llm_output.get("model_name", "Unknown Model") + # print(f"\nModel Name: {model_name}") + + # 4. Optionally, handle run_info (if it's included in your version of LangChain) + # This would typically be done if the LLMResult included any runtime info you want to log or analyze. + # Example: + run_info = llm_result.run + logger.debug(f"Run Info: {run_info}\n") + # if run_info: + # print("\nRun Info:") + # for key, value in run_info.items(): + # print(f"{key}: {value}") + + # Format the response message + response_message = { + "query": query_text, "response": generated_texts, "sources": result_metadata[0]['source'], "channel": result_metadata[0]['channel']} + + logger.debug(f"Response Message: {response_message}\n") + return response_message + except Exception as e: + print(f"Error querying the collection: {e}") + + +if __name__ == "__main__": + main() diff --git a/ai_local_rag/utils/__init__.py b/ai_local_rag/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/async_utils.py b/ai_local_rag/utils/async_utils.py similarity index 100% rename from utils/async_utils.py rename to ai_local_rag/utils/async_utils.py diff --git a/ai_local_rag/utils/get_embedding_function.py b/ai_local_rag/utils/get_embedding_function.py new file mode 100644 index 0000000..ae8add6 --- /dev/null +++ b/ai_local_rag/utils/get_embedding_function.py @@ -0,0 +1,95 @@ +import os + +import dotenv + +import numpy as np + +from langchain_community.embeddings.ollama import OllamaEmbeddings +from langchain_openai import OpenAIEmbeddings +from chromadb.utils import embedding_functions +from sentence_transformers import SentenceTransformer +import nomic +from nomic import embed +# from langchain_community.embeddings.bedrock import BedrockEmbeddings +# from langchain_community.embeddings import FastEmbedEmbeddings +dotenv.load_dotenv() + + +def get_embedding_function_for_slack(): + # embeddings = BedrockEmbeddings( + # credentials_profile_name="default", region_name="us-east-1" + # ) + + # Make sure you run this first: ollama pull nomic-embed-text + # ollama_model = OllamaModel("nomic-embed-text") + # embeddings = CustomOllamaEmbedding("nomic-embed-text-v1") + embeddings = CustomSentenceTransformerEmbedding('paraphrase-MiniLM-L6-v2') + + # embeddings = embedding_functions.DefaultEmbeddingFunction() + + # embeddings = embedding_functions.SentenceTransformerEmbeddingFunction( + # model_name="all-MiniLM-L6-v2") + + # embeddings = FastEmbedEmbeddings() + return embeddings + + +def get_embedding_function_for_pdf(): + # embeddings = BedrockEmbeddings( + # credentials_profile_name="default", region_name="us-east-1" + # ) + + # Make sure you run this first: ollama pull nomic-embed-text + embeddings = OllamaEmbeddings(model="nomic-embed-text") + + # embeddings = embedding_functions.DefaultEmbeddingFunction() + # embeddings = embedding_functions.SentenceTransformerEmbeddingFunction( + # model_name="all-MiniLM-L6-v2") + # embeddings = FastEmbedEmbeddings() + return embeddings + + +class CustomOpenAIEmbeddings(OpenAIEmbeddings): + + def __init__(self, openai_api_key, *args, **kwargs): + super().__init__(openai_api_key=openai_api_key, *args, **kwargs) + + def _embed_documents(self, texts): + embeddings = [ + self.client.create( + input=text, model="text-embedding-ada-002").data[0].embedding + for text in texts + ] + return embeddings + + def __call__(self, input): + return self._embed_documents(input) + + +class CustomOllamaEmbedding: + def __init__(self, model_name): + self.model_name = model_name + + def __call__(self, input): + nomic_api_key = os.getenv("NOMIC_API_KEY") + nomic.login(nomic_api_key) + if isinstance(input, str): + input = [input] + # Assuming nomic library provides a function to embed text + embeddings = embed.text(input, model=self.model_name) + return embeddings + + +class CustomSentenceTransformerEmbedding: + def __init__(self, model_name): + self.model = SentenceTransformer(model_name) + + def __call__(self, input): + if isinstance(input, str): + input = [input] + # Generate embeddings + embeddings = self.model.encode(input) + # Ensure embeddings are in list format if using NumPy arrays + if isinstance(embeddings, np.ndarray): + embeddings = embeddings.tolist() + return embeddings diff --git a/get_embedding_function.py b/get_embedding_function.py deleted file mode 100644 index cd9563e..0000000 --- a/get_embedding_function.py +++ /dev/null @@ -1,14 +0,0 @@ -from langchain_community.embeddings.ollama import OllamaEmbeddings -# from langchain_community.embeddings.bedrock import BedrockEmbeddings -# from langchain_community.embeddings import FastEmbedEmbeddings - - -def get_embedding_function(): - # embeddings = BedrockEmbeddings( - # credentials_profile_name="default", region_name="us-east-1" - # ) - - # Make sure you run this first: ollama pull nomic-embed-text - embeddings = OllamaEmbeddings(model="nomic-embed-text") - # embeddings = FastEmbedEmbeddings() - return embeddings diff --git a/requirements.txt b/requirements.txt index 18d9d39..9b42277 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ click==8.1.7 coloredlogs==15.0.1 dataclasses-json==0.6.7 Deprecated==1.2.14 +distro==1.9.0 dnspython==2.6.1 email_validator==2.2.0 exceptiongroup==1.2.2 @@ -44,13 +45,16 @@ importlib_resources==6.4.0 iniconfig==2.0.0 Jinja2==3.1.4 jmespath==1.0.1 +joblib==1.4.2 jsonpatch==1.33 jsonpointer==3.0.0 kubernetes==30.1.0 langchain==0.2.10 langchain-community==0.2.9 -langchain-core==0.2.22 +langchain-core==0.2.24 +langchain-openai==0.1.19 langchain-text-splitters==0.2.2 +langchainhub==0.1.20 langsmith==0.1.93 loguru==0.7.2 markdown-it-py==3.0.0 @@ -62,10 +66,12 @@ monotonic==1.6 mpmath==1.3.0 multidict==6.0.5 mypy-extensions==1.0.0 +networkx==3.3 numpy==1.26.4 oauthlib==3.2.2 onnx==1.16.1 onnxruntime==1.18.1 +openai==1.37.1 opentelemetry-api==1.25.0 opentelemetry-exporter-otlp-proto-common==1.25.0 opentelemetry-exporter-otlp-proto-grpc==1.25.0 @@ -97,23 +103,34 @@ python-dateutil==2.9.0.post0 python-dotenv==1.0.1 python-multipart==0.0.9 PyYAML==6.0.1 +regex==2024.7.24 requests==2.32.3 requests-oauthlib==2.0.0 rich==13.7.1 rsa==4.9 s3transfer==0.10.2 +safetensors==0.4.3 +scikit-learn==1.5.1 +scipy==1.14.0 +sentence-transformers==3.0.1 shellingham==1.5.4 six==1.16.0 +slack_sdk==3.31.0 sniffio==1.3.1 snowballstemmer==2.2.0 SQLAlchemy==2.0.31 starlette==0.37.2 sympy==1.13.1 tenacity==8.5.0 +threadpoolctl==3.5.0 +tiktoken==0.7.0 tokenizers==0.19.1 tomli==2.0.1 +torch==2.4.0 tqdm==4.66.4 +transformers==4.43.3 typer==0.12.3 +types-requests==2.32.0.20240712 typing-inspect==0.9.0 typing_extensions==4.12.2 urllib3==2.2.2 diff --git a/reset_database.py b/reset_database.py deleted file mode 100644 index 50dc21c..0000000 --- a/reset_database.py +++ /dev/null @@ -1,11 +0,0 @@ -import os -from dotenv import load_dotenv -import populate_database - -# Load Config Settings -load_dotenv() # take environment variables from .env. -CHROMA_PATH = chroma_path = os.getenv("CHROMA_PATH") - -populate_database.clear_database() -print(f"Removed all content from database {CHROMA_PATH}") - diff --git a/data_boardgames/monopoly.pdf b/source_data/data_boardgames/monopoly.pdf similarity index 100% rename from data_boardgames/monopoly.pdf rename to source_data/data_boardgames/monopoly.pdf diff --git a/data_boardgames/ticket_to_ride.pdf b/source_data/data_boardgames/ticket_to_ride.pdf similarity index 100% rename from data_boardgames/ticket_to_ride.pdf rename to source_data/data_boardgames/ticket_to_ride.pdf diff --git a/data_machine_learning_lectures/MachineLearning-Lecture01.pdf b/source_data/data_machine_learning_lectures/MachineLearning-Lecture01.pdf similarity index 100% rename from data_machine_learning_lectures/MachineLearning-Lecture01.pdf rename to source_data/data_machine_learning_lectures/MachineLearning-Lecture01.pdf diff --git a/data_machine_learning_lectures/MachineLearning-Lecture02.pdf b/source_data/data_machine_learning_lectures/MachineLearning-Lecture02.pdf similarity index 100% rename from data_machine_learning_lectures/MachineLearning-Lecture02.pdf rename to source_data/data_machine_learning_lectures/MachineLearning-Lecture02.pdf diff --git a/data_machine_learning_lectures/MachineLearning-Lecture03.pdf b/source_data/data_machine_learning_lectures/MachineLearning-Lecture03.pdf similarity index 100% rename from data_machine_learning_lectures/MachineLearning-Lecture03.pdf rename to source_data/data_machine_learning_lectures/MachineLearning-Lecture03.pdf diff --git a/data_machine_learning_lectures/MachineLearning-Lecture04.pdf b/source_data/data_machine_learning_lectures/MachineLearning-Lecture04.pdf similarity index 100% rename from data_machine_learning_lectures/MachineLearning-Lecture04.pdf rename to source_data/data_machine_learning_lectures/MachineLearning-Lecture04.pdf diff --git a/data_machine_learning_lectures/MachineLearning-Lecture05.pdf b/source_data/data_machine_learning_lectures/MachineLearning-Lecture05.pdf similarity index 100% rename from data_machine_learning_lectures/MachineLearning-Lecture05.pdf rename to source_data/data_machine_learning_lectures/MachineLearning-Lecture05.pdf diff --git a/data_machine_learning_lectures/MachineLearning-Lecture06.pdf b/source_data/data_machine_learning_lectures/MachineLearning-Lecture06.pdf similarity index 100% rename from data_machine_learning_lectures/MachineLearning-Lecture06.pdf rename to source_data/data_machine_learning_lectures/MachineLearning-Lecture06.pdf diff --git a/source_data/data_slack/WhatsCookinTeam Slack export Jul 25 2024 - Jul 26 2024.zip b/source_data/data_slack/WhatsCookinTeam Slack export Jul 25 2024 - Jul 26 2024.zip new file mode 100644 index 0000000..65b5635 Binary files /dev/null and b/source_data/data_slack/WhatsCookinTeam Slack export Jul 25 2024 - Jul 26 2024.zip differ diff --git a/test_rag.py b/test_rag.py index e19ed03..3b813b2 100644 --- a/test_rag.py +++ b/test_rag.py @@ -1,4 +1,4 @@ -from query_data import query_rag +from ai_local_rag.query_data import query_rag from langchain_community.llms.ollama import Ollama EVAL_PROMPT = """ @@ -37,11 +37,13 @@ def query_and_validate(question: str, expected_response: str): if "true" in evaluation_results_str_cleaned: # Print response in Green if it is correct. - print("\033[92m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m") + print("\033[92m" + + f"Response: {evaluation_results_str_cleaned}" + "\033[0m") return True elif "false" in evaluation_results_str_cleaned: # Print response in Red if it is incorrect. - print("\033[91m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m") + print("\033[91m" + + f"Response: {evaluation_results_str_cleaned}" + "\033[0m") return False else: raise ValueError(