diff --git a/.gitignore b/.gitignore
index 1ac4c6e..7bc5255 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,3 +40,12 @@ Test_Results/results_test_1-6-24.csv
 Test_Results/results_test_1-3-24_log.txt
 Test_Results/results_test_1-3-24.csv
 Test_Results/results_test_1-2-24_log.txt
+
+venv/
+__pycache__/
+__pycache__/*.pyc
+
+data/
+images.txt
+src/*.pdf
+src/*.txt
\ No newline at end of file
diff --git a/TreeHugger_Exam_ans.csv b/TreeHugger_Exam_ans.csv
deleted file mode 100644
index 488f052..0000000
--- a/TreeHugger_Exam_ans.csv
+++ /dev/null
@@ -1,76 +0,0 @@
-,Question,Answer
-0,True or False Magnolia grandiflora is evergreen.,True
-1,True or False Pacific yew has a rapid growth rate.,False
-2,True or False Platanus occidentalis is native to the east coast.,True
-3,"True or False The cones red spruce are quite long, between 6 and 8” in length",False
-4,True or False Douglas-fir is not a true fir,True
-5,True or False Red pine is considered a hard pine,True
-6,True or False The branches of northern white-cedar can take root if the tree falls,True
-7,True or False Abies grandis is native to the southeastern United States,False
-8,True or False Pinus resinosa is native along the Gulf coast,False
-9,True or False Larix laricina is an evergreen conifer.,False
-10,True or False Sweetgum is the most common and widely distributed tree in the Mississippi delta,True
-11,True or False The twigs of slippery elm are thinner than those of American elm,False
-12,True or False Quercus rubra is the most important and widespread of northern oaks,True
-13,True or False White spruce is intolerant of shade,False
-14,True or False The needles of baldcypress are two-ranked,True
-15,"True or False Southern magnolia is a nonnative, invasive species in the southeastern U.S",False
-16,True or False The timber of black oak is sold as red oak.,True
-17,True or False Thuja plicata is native to West Virginia and Pennsylvania,False
-18,True or False The fruit of bur oak has a fringed cap,True
-19,True or False Yellow birch has a terminal bud,False
-20,True or False Redwood grows in the fog belt,True
-21,True or False Betula papyrifera is the most widely distributed of the native birches.,True
-22,True or False Red alder is native to Guatemala,False
-23,True or False Magnolia acuminata is deciduous,True
-24,True or False The twigs of quaking aspen are grey in color,False
-25,True or False Virginia pine is used for reclamation,True
-26,True or False The cones of Thuja plicata hang downwards,False
-27,True or False The cones of Abies point upward,True
-28,True or False The firs discussed in lecture are commonly seen growing in the forests of West Virginia,False
-29,True or False The timber of Tsuga heterophylla is sold as “hem/spruce”,False
-31,"The fruits of eastern Tsuga heterophylla are: A. small purple berries, B. blue colored berries, C. red drupes, D. light brown colored cones",D. Are Blue Drupes
-32,"The fruits of Paulownia tomentosa: A. are woody capsules, B. are orange berries, C. are small cones, D. are blue drupes.",A. Are woddy Capsules
-33,"The bark of western redcedar: A. is deeply ridged and furrowed, B. is fibrous, C. is smooth and shiny, D. is covered in resin blisters.",B. is fibrous
-34,"Incense-cedar: A. has duck bill like cones, B. is native to the northeastern U.S., C. has small upward facing cones, D. has glaucous, purplish cones.",A. has duck bill like cones
-35,"Which of the following species is NOT in the Pinaceae family? A. Taxus brevifolia, B. Tsuga heterophylla, C. Larix laricina, D. Abies procera.",A. Taxus brevifolia
-36,"Bush honeysuckles: A. are ok to leave growing on lands that you manage, B. are native to South America, C. have opposite leaves, D. have fruits that are small legumes",C. Have opposite Leaves
-37,"Loblolly pine: A. is a small pine native to Central Appalachia, B. has a slow growth rate, C. is considered a soft pine, D. is the leading commercial timber tree in the southeast.",D. Is the leading commercial timber tree in the southeast
-38,"Pacific yew: A. has needles that are in fascicles of 2, B. is a large sized tree, reaching heights of 100 to 130 feet, C. prefers dry sites, D. is a source of Taxol for chemotherapy",D. is a source of taxol for chemotherapy
-39,"Incense-cedar is in the ______ genus: A. Tsuga, B. Calocedrus, C. Thuja, D. Juniperus.",B. Calocedrus
-40,"The bark of redwood: A. is smooth and gray, B. is white and deeply furrowed, C. fibrous and up to 1 foot thick, D. covered in corky warts and ridges.",C. fibrous and up to 1 foot thick
-41,"The fruit of Liquidambar styraciflua: A. is a head of strongly beaked capsules, B. a brown berry, C. a yellowish brown cone, D. an aggregate of samaras",A. is a head of strongly beaked capsules
-42,"The cones of giant sequoia: A. are 8” to 12” in length, B. have peltate scales, C. are bright yellow in color, D. have a blue aril surrounding the seed",B. Have peltate scales
-43,"Which of the following species is in the Fagaceae family?: A. Alnus rubra, B. Ostrya virginiana, C. Lithocarpus densiflorus, D. Tilia Americana",C. Lithocapus densiflorus
-44,"The fruit of ginkgo: A. has a foul odor when ripe, B. is a large brown cone, C. is a yellow berry, D. has peltate scales",A. has a foul oder when ripe
-45,"The fruit of American sycamore: A. is a woody capsule, B. is a fused, aggregate of achenes, C. is a brown berry, D. is an aggregate of samaras.","B. is a fused, aggregate of achenes "
-46,"Calocedrus decurrens: A. is native to swamps in the southeastern U.S., B. has a pinnately compound leaf, C. is native to the western U.S., D. is native to the Great Lakes states",C. is native to the western US
-47,"Yellow-poplar: A. has a rapid growth rate, B. is very tolerant of shade, C. has a very slow growth rate, D. is in the Salicaceae family.",A. has a rapid growth rate
-48,"The genus Abies is in the: A. the Cupressaceae family, B. the Taxodiaceae family, C. Abeaceae family, D. Pinaceae family",D. penaceae family
-49,"Taxus brevifolia: A. is native to the mountains of West Virginia, B. is native to the state of Texas, C. is native to the Pacific northwest, D. is native to the Canadian province of New Brunswick.",C. is native to the pacific northwest
-50,"Pecan: A. is a species of hickory, B. is in the Fagaceae family, C. is in the Betulaceae family, D. is native primarily to the northeastern United States.",A. is a species of hickory
-53,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Willow Oak B. Seed of Pecific Yew C. Sequoiadendron giganteum D.Carya Cordiformis E. 3 years in grass stage Question to match: Scarlet aril,B. seed of pacific yew
-54,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Willow Oak B. Seed of Pecific Yew C. Sequoiadendron giganteum D.Carya Cordiformis E. 3 years in grass stage Question to match: Mustard yellow buds,D. Carya Cordiformis
-55,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Willow Oak B. Seed of Pecific Yew C. Sequoiadendron giganteum D.Carya Cordiformis E. 3 years in grass stage Question to match: Pinus Plaustris,E. 3 years in grass stage
-56,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Willow Oak B. Seed of Pecific Yew C. Sequoiadendron giganteum D.Carya Cordiformis E. 3 years in grass stage Question to match: Quercus Phellos,A. willow oak
-57,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Willow Oak B. Seed of Pecific Yew C. Sequoiadendron giganteum D.Carya Cordiformis E. 3 years in grass stage Question to match: World's laregst tree,C. Sequoiadendron giganteum
-59,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Rhus Tyhina B. Sassafras Albidum C. Cypress Knee D. Most important commerical Timeber Tree in Canada E. Wa 'kaulua Question to match: Taxodium distichum,C. Cypress knee
-60,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Rhus Tyhina B. Sassafras Albidum C. Cypress Knee D. Most important commerical Timeber Tree in Canada E. Wa 'kaulua Question to match: Staghorn Sumac,A. Rhus typhina
-61,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Rhus Tyhina B. Sassafras Albidum C. Cypress Knee D. Most important commerical Timeber Tree in Canada E. Wa 'kaulua Question to match: Safrole,B. sassafras albidum
-62,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Rhus Tyhina B. Sassafras Albidum C. Cypress Knee D. Most important commerical Timeber Tree in Canada E. Wa 'kaulua Question to match: Douglas-fir,E wa'kaulua
-63,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Rhus Tyhina B. Sassafras Albidum C. Cypress Knee D. Most important commerical Timeber Tree in Canada E. Wa 'kaulua Question to match: White spruce,D. most important commercial timber tree in canada
-67,"Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. tallest tree on earth B. Spruce that was discussed, that is native only to eastern U.S. C. Aggregate of smaras D. Also called Aborvitae E. Tyloses Question to Answer: Red Spruce","B. Spruce that was discussed, that is native "
-68,"Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. tallest tree on earth B. Spruce that was discussed, that is native only to eastern U.S. C. Aggregate of smaras D. Also called Aborvitae E. Tyloses Question to Answer: redwood",A. Tallest tree on earth
-69,"Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. tallest tree on earth B. Spruce that was discussed, that is native only to eastern U.S. C. Aggregate of smaras D. Also called Aborvitae E. Tyloses Question to Answer: White Oak group",E. tyloses
-70,"Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. tallest tree on earth B. Spruce that was discussed, that is native only to eastern U.S. C. Aggregate of smaras D. Also called Aborvitae E. Tyloses Question to Answer: Yellow-poplar",C. aggregate of samaras
-71,"Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. tallest tree on earth B. Spruce that was discussed, that is native only to eastern U.S. C. Aggregate of smaras D. Also called Aborvitae E. Tyloses Question to Answer: northern white-cedar",D. also called arborvitac
-73,"Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Pink to Purple flowers, invasive B. Most important hard wood in the pacific northwest C. Pencil Stock D. also called buttonwod E. Corylus Question to Answer: American Sycamore",D. also called button wod
-74,"Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Pink to Purple flowers, invasive B. Most important hard wood in the pacific northwest C. Pencil Stock D. also called buttonwod E. Corylus Question to Answer: Paulownia tomentosa","A. pink to purple flowers, invasice "
-75,"Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Pink to Purple flowers, invasive B. Most important hard wood in the pacific northwest C. Pencil Stock D. also called buttonwod E. Corylus Question to Answer: Betulaceae family",E Corylus
-76,"Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Pink to Purple flowers, invasive B. Most important hard wood in the pacific northwest C. Pencil Stock D. also called buttonwod E. Corylus Question to Answer: Alnus Rubra",B. Most important hardwood in the pacific northwest
-77,"Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Pink to Purple flowers, invasive B. Most important hard wood in the pacific northwest C. Pencil Stock D. also called buttonwod E. Corylus Question to Answer: Calocedrus decurrens",C. Pencil stock
-79,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Suction cup like case B. Protetion of hihg altitude watersheds C. Salix nigra D. loblolly pine E. fastest growin gof all the southern pines Question to Answer: Once used for makin gbaskets,C. Salix nigra
-80,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Suction cup like case B. Protetion of hihg altitude watersheds C. Salix nigra D. loblolly pine E. fastest growin gof all the southern pines Question to Answer: pinus Taeda,D. Loblolly pine
-81,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Suction cup like case B. Protetion of hihg altitude watersheds C. Salix nigra D. loblolly pine E. fastest growin gof all the southern pines Question to Answer: Larix occidentalis,E. Fastest growing in the southern pines
-82,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Suction cup like case B. Protetion of hihg altitude watersheds C. Salix nigra D. loblolly pine E. fastest growin gof all the southern pines Question to Answer: Abies,A. Suction cup like base
-83,Direction: Place the letter of the best match from the right-hand list in the blank next to the item in the left-hand list below A. Suction cup like case B. Protetion of hihg altitude watersheds C. Salix nigra D. loblolly pine E. fastest growin gof all the southern pines Question to Answer: Pilus elliottii,B. Protection of high altitude watersheds
diff --git a/data/chroma_db/chroma.sqlite3 b/data/chroma_db/chroma.sqlite3
new file mode 100644
index 0000000..58d0468
Binary files /dev/null and b/data/chroma_db/chroma.sqlite3 differ
diff --git a/src/app.py b/src/app.py
new file mode 100644
index 0000000..bba7569
--- /dev/null
+++ b/src/app.py
@@ -0,0 +1,43 @@
+# __import__('pysqlite3')
+# import sys
+# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
+# import sqlite3
+
+import streamlit as st
+from components.frontend.chat import Chat_UI
+from components.frontend.sidebar import Sidebar
+from components.backend.pipeline.pipeline import Pipeline
+import os
+import uuid
+
+
+st.set_page_config(layout='wide')
+
+
+@st.cache_resource
+def initialize():
+    pipeline = Pipeline()
+    return pipeline, Sidebar(pipeline), Chat_UI(pipeline)
+
+class UI:
+    def __init__(self):
+        self._pipeline, self.sidebar, self.chat = initialize()
+        # Only seed these once, so a Streamlit rerun does not reset them.
+        if 'documents' not in st.session_state:
+            st.session_state['documents'] = False
+        if 'user_id' not in st.session_state:
+            st.session_state['user_id'] = str(uuid.uuid4())
+        # Read the key from the environment rather than hard-coding a secret.
+        st.session_state['api_key'] = os.environ.get("OPENAI_API_KEY", "")
+
+        if 'messages' not in st.session_state:
+            st.session_state['messages'] = []
+
+    def render(self):
+        self.sidebar()
+        self.chat()
+
+def main():
+    UI().render()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/assets/eugenie.png b/src/assets/eugenie.png
new file mode 100644
index 0000000..7fc9ffc
Binary files /dev/null and b/src/assets/eugenie.png differ
diff --git a/src/components/backend/pipeline/document_handler.py b/src/components/backend/pipeline/document_handler.py
new file mode 100644
index 0000000..15ecea6
--- /dev/null
+++ b/src/components/backend/pipeline/document_handler.py
@@ -0,0 +1,64 @@
+import numpy as np
+import fitz
+import requests
+import time as time
+import uuid
+import streamlit as st
+import base64
+
+class Document_Handler:
+    def __init__(self):
+        pass
+
+    def __call__(self, bytes_array):
+        return self.extract_and_chunk(bytes_array)
+
+    def semantic_chunking(self, text, chunk_size=200, overlap=50):
+        chunks = []
+        current_chunk = ""
+        words = text.split()
+
+        for word in words:
+            current_chunk += (word + " ")
+            if len(current_chunk) >= chunk_size:
+                period_pos = current_chunk.rfind('. ')
+                if period_pos != -1 and period_pos + 1 < len(current_chunk):
+                    chunks.append(current_chunk[:period_pos + 1])
+                    current_chunk = current_chunk[max(period_pos + 1 - overlap, 0):]
+                else:
+                    chunks.append(current_chunk.strip())
+                    current_chunk = ""
+
+        if len(current_chunk) > chunk_size // 2:
+            chunks.append(current_chunk.strip())
+
+        return chunks
+
+    def extract_and_chunk(self, file_name):
+        doc = fitz.open(file_name)
+
+        text_blocks = []
+        id = file_name
+
+        for page_num in range(len(doc)):
+            page = doc[page_num]
+            blocks = page.get_text("dict")["blocks"]
+
+            for b in blocks:
+                if "lines" in b:
+                    bbox = fitz.Rect(b["bbox"])
+                    text = " ".join([" ".join([span["text"] for span in line["spans"]]) for line in b["lines"]])
+
+                    if len(text.split()) > 100:
+                        chunks = self.semantic_chunking(text)
+                    else:
+                        chunks = [text]
+
+                    for chunk in chunks:
+                        text_blocks.append((id, page_num, bbox.x0, bbox.y0, bbox.x1, bbox.y1, chunk))
+
+        print('here')
+
+        doc.close()
+        return text_blocks
+
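Document_Handler is the ingestion step: `__call__` hands a file path to `extract_and_chunk`, which walks PyMuPDF text blocks and routes long blocks through `semantic_chunking` (roughly 200-character chunks split at the last sentence boundary, with a 50-character overlap). A minimal sketch of exercising it outside Streamlit (not part of the patch; it assumes PyMuPDF is installed, the code runs from `src/` so the `components` package resolves, and `sample.pdf` is a placeholder file):

```python
# Sketch only: exercising Document_Handler directly, outside the Streamlit app.
from components.backend.pipeline.document_handler import Document_Handler

handler = Document_Handler()

# The chunker works on plain strings: ~200-character chunks, split at the
# last ". " boundary, with ~50 characters carried over into the next chunk.
text = "Tree identification relies on leaves, bark, and fruit. " * 20
chunks = handler.semantic_chunking(text, chunk_size=200, overlap=50)
print(len(chunks), chunks[0][:60])

# Full path: returns (id, page_num, x0, y0, x1, y1, chunk) tuples per block.
# "sample.pdf" is a placeholder for any local PDF.
blocks = handler.extract_and_chunk("sample.pdf")
print(blocks[0])
```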
diff --git a/src/components/backend/pipeline/llm.py b/src/components/backend/pipeline/llm.py
new file mode 100644
index 0000000..1ad218d
--- /dev/null
+++ b/src/components/backend/pipeline/llm.py
@@ -0,0 +1,9 @@
+from langchain_openai import ChatOpenAI
+
+import os, re, json
+
+class LLM:
+    def __init__(self, temperature=0.0001):
+        self.llm = ChatOpenAI(model_name='gpt-4', temperature=temperature)
+
+    
\ No newline at end of file
diff --git a/src/components/backend/pipeline/pipeline.py b/src/components/backend/pipeline/pipeline.py
new file mode 100644
index 0000000..66f3005
--- /dev/null
+++ b/src/components/backend/pipeline/pipeline.py
@@ -0,0 +1,51 @@
+from components.backend.pipeline.vectorstore import VectorStore
+from components.backend.pipeline.llm import LLM
+from components.backend.pipeline.document_handler import Document_Handler
+
+import os, io
+
+from components.backend.tools.python_interpreter import PythonInterpreter
+from components.backend.tools.arxiv_search import ArxivSearch
+from components.backend.tools.calculator import Calculator
+from components.backend.tools.web_search import WebSearch
+from components.backend.tools.rag import RAG
+
+from langchain.agents import initialize_agent
+
+# Placeholders: export real credentials in the environment, never commit them.
+os.environ.setdefault("OPENAI_API_KEY", "<OPENAI_API_KEY>")
+os.environ.setdefault("PINECONE_API_KEY", "<PINECONE_API_KEY>")
+os.environ.setdefault("GOOGLE_API_KEY", "<GOOGLE_API_KEY>")
+os.environ.setdefault("GOOGLE_CSE_ID", "<GOOGLE_CSE_ID>")
+
+class Pipeline:
+    def __init__(self, max_iterations=5):
+        self.document_handler = Document_Handler()
+        self.llm = LLM()
+        self.vectorstore = VectorStore()
+        self.rag = RAG(llm=self.llm.llm, vectorstore=self.vectorstore.vectorstore)
+        self.tools = [
+            PythonInterpreter(llm=self.llm.llm).initialize(),
+            ArxivSearch().initialize(),
+            Calculator(llm=self.llm.llm).initialize(),
+            WebSearch(llm=self.llm.llm, vectorstore_public=self.vectorstore.vectorstore).initialize(),
+        ]
+
+        self.agent = initialize_agent(self.tools,
+                                      self.llm.llm,
+                                      agent="chat-conversational-react-description",
+                                      verbose=True,
+                                      handle_parsing_errors=True,
+                                      max_iterations=max_iterations
+                                      )
+
+    def run(self, query, chat_history):
+        return self.agent.invoke({'input': query.strip(), 'chat_history': chat_history})
+
+    def add(self, pdf):
+        self.vectorstore.add(self.document_handler(pdf))
+        print('Done')
+        return 1
+
+    def get_sources(self, query):
+        return self.vectorstore.get_sources(query)
\ No newline at end of file
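Pipeline is the glue: it builds the vector store, a RetrievalQA-based RAG chain, and a conversational ReAct agent over the four tools. A minimal sketch of driving it from a REPL (not part of the patch; it assumes the keys above are exported in the environment, the code runs from `src/`, and `sample.pdf` plus the questions are placeholders):

```python
# Sketch only: using the Pipeline without the Streamlit front end.
from components.backend.pipeline.pipeline import Pipeline

pipeline = Pipeline()

# Index a local PDF, then answer from it via the RetrievalQA chain.
pipeline.add("./sample.pdf")
print(pipeline.rag.run("What species does the document describe?")["result"])

# Or let the conversational agent choose among the registered tools.
print(pipeline.run("What is 17 * 23?", chat_history=[])["output"])
```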
diff --git a/src/components/backend/pipeline/vectorstore.py b/src/components/backend/pipeline/vectorstore.py
new file mode 100644
index 0000000..4c2a471
--- /dev/null
+++ b/src/components/backend/pipeline/vectorstore.py
@@ -0,0 +1,50 @@
+import chromadb
+from langchain_community.vectorstores import Chroma
+from langchain_openai import OpenAIEmbeddings
+import chromadb.utils.embedding_functions as embedding_functions
+import os
+import pandas as pd
+import uuid
+
+class VectorStore:
+    def __init__(self):
+        self.chroma_client = chromadb.Client()
+        openai_ef = embedding_functions.OpenAIEmbeddingFunction(
+            api_key=os.environ["OPENAI_API_KEY"],
+            model_name="text-embedding-ada-002"
+        )
+        self.collection = self.chroma_client.get_or_create_collection(name="user", embedding_function=openai_ef)
+        self.embeddings_model = OpenAIEmbeddings()
+
+        self.vectorstore = Chroma(
+            client=self.chroma_client,
+            collection_name="user",
+            embedding_function=self.embeddings_model,
+        )
+
+    def as_retriever(self):
+        return self.vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":3})
+
+    def add(self, text_blocks):
+        df = pd.DataFrame(text_blocks, columns=['id', 'page_num', 'xmin', 'ymin', 'xmax', 'ymax', 'text'])
+
+        assert len(set(df['id'])) == 1
+
+        uuids = [str(uuid.uuid4()) for _ in range(len(df))]
+
+        self.collection.add(
+            documents=df['text'].tolist(),
+            metadatas=df[['id', 'page_num', 'xmin', 'ymin', 'xmax', 'ymax', 'text']].to_dict(orient='records'),
+            ids=uuids
+        )
+
+        del df
+
+        return 1
+
+
+    def get_sources(self, query):
+        return self.collection.query(
+            query_texts=query,
+            n_results=3
+        )['metadatas'][0]
\ No newline at end of file
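VectorStore keeps one Chroma collection and exposes it both through the raw chromadb client (`add`, `get_sources`) and through the LangChain `Chroma` wrapper (`as_retriever`). A minimal sketch of using it on its own (not part of the patch; it assumes OPENAI_API_KEY is set, and the tuples below are made-up stand-ins for Document_Handler output):

```python
# Sketch only: adding and querying chunks without the rest of the pipeline.
from components.backend.pipeline.vectorstore import VectorStore

store = VectorStore()

# add() expects (id, page_num, xmin, ymin, xmax, ymax, text) tuples that all
# share one document id (the method asserts this before writing to Chroma).
store.add([
    ("demo.pdf", 0, 72.0, 80.0, 520.0, 120.0, "Redwood grows in the coastal fog belt."),
    ("demo.pdf", 0, 72.0, 130.0, 520.0, 170.0, "Loblolly pine is a leading southern timber tree."),
    ("demo.pdf", 1, 72.0, 80.0, 520.0, 120.0, "Pacific yew bark is a source of Taxol."),
])

# Metadata (page number, bounding box, text) for the 3 nearest chunks.
print(store.get_sources("Which tree is a source of Taxol?"))
```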
diff --git a/src/components/backend/tools/arxiv_search.py b/src/components/backend/tools/arxiv_search.py
new file mode 100644
index 0000000..635270a
--- /dev/null
+++ b/src/components/backend/tools/arxiv_search.py
@@ -0,0 +1,15 @@
+from langchain_community.utilities import ArxivAPIWrapper
+from langchain.tools import Tool
+from pydantic import BaseModel, Field
+
+
+class ArxivSearch:
+    def __init__(self):
+        self.arxiv = ArxivAPIWrapper()
+
+    def initialize(self):
+        return Tool.from_function(
+            func=self.arxiv.run,
+            name="arxiv",
+            description="useful for when you need to answer research based questions or find scientific documents or papers",
+        )
\ No newline at end of file
diff --git a/src/components/backend/tools/calculator.py b/src/components/backend/tools/calculator.py
new file mode 100644
index 0000000..277e123
--- /dev/null
+++ b/src/components/backend/tools/calculator.py
@@ -0,0 +1,14 @@
+from langchain.chains import LLMMathChain
+from langchain.tools import Tool
+from pydantic import BaseModel, Field
+
+class Calculator:
+    def __init__(self, llm):
+        self.llm = llm
+
+    def initialize(self):
+        return Tool.from_function(
+            func=LLMMathChain.from_llm(llm=self.llm, verbose=True).run,
+            name="Calculator",
+            description="useful for when you need to answer questions about math",
+        )
\ No newline at end of file
diff --git a/src/components/backend/tools/python_interpreter.py b/src/components/backend/tools/python_interpreter.py
new file mode 100644
index 0000000..86891cf
--- /dev/null
+++ b/src/components/backend/tools/python_interpreter.py
@@ -0,0 +1,40 @@
+from langchain_core.output_parsers import StrOutputParser
+from langchain_experimental.utilities import PythonREPL
+from langchain.tools import Tool
+from langchain_core.prompts import ChatPromptTemplate
+
+class PythonInterpreter:
+    def __init__(self, llm):
+        self.llm = llm
+
+    def _sanitize_output(self, text: str):
+        _, after = text.split("```python")
+        return after.split("```")[0]
+
+    def python_interpreter(self, query):
+        template = """Write some python code to solve the user's problem.
+
+        Return only python code in Markdown format, e.g.:
+
+        ```python
+        ....
+        ```"""
+        prompt = ChatPromptTemplate.from_messages([("system", template), ("human", "{input}")])
+        chain = prompt | self.llm | StrOutputParser() | self._sanitize_output | PythonREPL().run
+        output = chain.invoke({"input": query})
+        print("Python interpreter")
+        print(output)
+        return output
+
+    def initialize(self):
+        return Tool.from_function(
+            func=self.python_interpreter,
+            name="python_interpreter",
+            description="""The Python Code Generator Tool is a sophisticated utility designed to craft Python code solutions for a wide array of questions. When provided with a question, this tool leverages advanced algorithms to generate concise and efficient Python code snippets as answers.
+
+            Usage Instructions:
+
+            Pose a question requiring a Python code solution.
+            If existing tools are deemed insufficient for the task, instruct the Assistant to utilize the Python Code Generator Tool.
+            Expect a response in the form of a Markdown-formatted Python code block, enclosed within triple backticks.""",
+        )
\ No newline at end of file
diff --git a/src/components/backend/tools/rag.py b/src/components/backend/tools/rag.py
new file mode 100644
index 0000000..56de7af
--- /dev/null
+++ b/src/components/backend/tools/rag.py
@@ -0,0 +1,17 @@
+from langchain.chains import RetrievalQA
+from langchain.tools import Tool
+
+class RAG:
+    def __init__(self, llm, vectorstore):
+        self.llm = llm
+        self.vectorstore = vectorstore
+
+    def run(self, query):
+        retrieval_qa = RetrievalQA.from_chain_type(llm=self.llm, chain_type="stuff",
+                                                   retriever=self.vectorstore.as_retriever())
+
+        answer = retrieval_qa.invoke(query)
+
+        del retrieval_qa
+
+        return answer
diff --git a/src/components/backend/tools/web_search.py b/src/components/backend/tools/web_search.py
new file mode 100644
index 0000000..6e06184
--- /dev/null
+++ b/src/components/backend/tools/web_search.py
@@ -0,0 +1,22 @@
+from langchain.tools import Tool
+from langchain.chains import RetrievalQAWithSourcesChain
+from langchain.retrievers.web_research import WebResearchRetriever
+from langchain_community.utilities import GoogleSearchAPIWrapper
+
+class WebSearch:
+    def __init__(self, llm, vectorstore_public):
+        self.llm = llm
+        self.search = GoogleSearchAPIWrapper()
+        self.web_retriever = WebResearchRetriever.from_llm(
+            vectorstore=vectorstore_public,
+            llm=self.llm,
+            search=self.search,
+            num_search_results=3
+        )
+
+    def initialize(self):
+        return Tool.from_function(
+            func=RetrievalQAWithSourcesChain.from_chain_type(llm=self.llm, retriever=self.web_retriever),
+            name="web_QA",
+            description="web_QA is a web searching tool for the LLM agent, triggered when the similarity score from in-context QA is too low. It dynamically integrates the LLM and a web retriever to broaden knowledge through targeted web searches, enhancing the agent's responsiveness and adaptability to diverse user queries",
+        )
\ No newline at end of file
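The five tool wrappers above all follow the same pattern: hold any dependencies in `__init__` and expose `initialize()`, which returns a LangChain `Tool` built from a callable plus a natural-language description the agent uses for routing. A minimal sketch of adding one more tool in that style (not part of the patch; the word counter is a hypothetical example):

```python
# Sketch only: a hypothetical extra tool wired up like the ones above.
from langchain.agents import initialize_agent
from langchain.tools import Tool

from components.backend.pipeline.llm import LLM

class WordCounter:
    def initialize(self):
        return Tool.from_function(
            func=lambda text: f"{len(text.split())} words",
            name="word_counter",
            description="useful for when you need to count the words in a piece of text",
        )

llm = LLM().llm
agent = initialize_agent(
    [WordCounter().initialize()],
    llm,
    agent="chat-conversational-react-description",
    verbose=True,
)
print(agent.invoke({"input": "How many words are in 'the quick brown fox'?",
                    "chat_history": []}))
```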
diff --git a/src/components/frontend/chat.py b/src/components/frontend/chat.py
new file mode 100644
index 0000000..d356933
--- /dev/null
+++ b/src/components/frontend/chat.py
@@ -0,0 +1,202 @@
+import streamlit as st
+import os, re, json
+import base64
+import extra_streamlit_components as stx
+from annotated_text import annotated_text
+import datetime
+from langchain_core.messages import AIMessage, HumanMessage
+import fitz, io
+from PIL import Image, ImageDraw
+
+@st.cache_resource(experimental_allow_widgets=True)
+def get_manager():
+    return stx.CookieManager()
+
+class CookieManager:
+    def __init__(self, cookie_name='messages'):
+        self.manager = get_manager()
+        self.cookie_name = cookie_name
+
+    def __call__(self):
+        _ = self.manager.get_all()
+
+    def get(self):
+        return self.manager.get(cookie=self.cookie_name)
+
+    def set(self, value):
+        self.manager.set(self.cookie_name, value)
+
+    def delete(self):
+        self.manager.delete(cookie=self.cookie_name)
+
+class Chat_UI:
+    def __init__(self, pipeline):
+        self.pipeline = pipeline
+        self.cookie_manager = CookieManager()
+
+    def render(self):
+        self.chat()
+
+    def initiate_memory(self):
+        if len(st.session_state['messages']) < 2:
+            history = self.get_messages()
+
+            if not history:
+                st.session_state['messages'] = [{"role": "assistant", "content": "Hello! The name's euGenio. I'm here to help you with your pipeline. Ask me a question!"}]
+            else:
+                st.session_state['messages'] = history
+
+    def append(self, message: dict):
+        st.session_state['messages'].append(message)
+
+    def __call__(self):
+        self.cookie_manager()
+        # Instantiates the chat history
+        self.initiate_memory()
+        self.load_memory()
+
+        # Loads the text tab
+        self.load_chatbox()
+
+    def load_chatbox(self):
+        user_input = st.text_input("*Got a question?*", help='Try to specify keywords and intent in your question!', key="text", on_change=self.handle_query)
+
+        if st.button('Delete History', use_container_width=True, type='primary'):
+            self.delete_messages()
+
+    def load_memory(self):
+        messages = st.session_state['messages']
+        print(messages)
+        if messages:
+            with st.spinner('Loading Memory...'):
+                for message in messages:
+                    role = message["role"]
+                    content = message["content"]
+
+                    with st.chat_message(role):
+                        if type(content) == dict and role == 'assistant':
+                            if 'images' in content:
+                                with st.expander("Thought Process!", expanded=True):
+                                    st.json({key: value for key, value in content.items() if key != 'images'})
+                                with st.expander("Sources!", expanded=False):
+                                    with open('./images.txt', 'r') as f:
+                                        vals = f.read()
+                                    st.markdown(vals, unsafe_allow_html=True)
+                            else:
+                                with st.expander("Thought Process!", expanded=True):
+                                    st.json(content)
+
+                        else:
+                            st.markdown(content)
+
+    def format_history(self):
+        messages = st.session_state['messages']
+
+        if messages:
+            formatted = []
+            for message in messages[1:]:
+                if message['role'] == 'user':
+                    formatted.append(HumanMessage(content=str(message['content'])))
+                else:
+                    formatted.append(AIMessage(content=str(message['content'])))
+            return formatted
+        else:
+            return []
+
+    def handle_query(self):
+        text = st.session_state["text"]
+        st.session_state["text"] = ""
+
+        user_message = {"role": "user", "content": text}
+        self.append(user_message)
+
+        with st.chat_message("user"):
+            st.markdown(text)
+
+        with st.chat_message("assistant"):
+            idx, tool = 0, None
+
+            with st.expander("Thought Process!", expanded=True):
+                with st.spinner('Thinking...'):
+                    if st.session_state['documents']:
+                        results = self.pipeline.rag.run(query=text)
+                        answer = results['result']
+                        st.markdown(answer)
+                    else:
+                        results = self.pipeline.run(query=text, chat_history=self.format_history())
+                        answer = results['output']
+                        st.json(answer)
+
+            if st.session_state['documents']:
+                with st.expander("Sources!", expanded=True):
+                    vals = self._generate_images(self.pipeline.get_sources(query=f"{text}:{answer}"))
+                    st.markdown(vals)
+                    with open('./images.txt', 'w') as f:
+                        f.write(vals)
+                    results['images'] = 'Present'
+
+            print(results)
+            idx += 1
+
+        assistant_message = {"role": "assistant", "content": {key: value for key, value in results.items() if key != 'chat_history'}}
+
+        self.append(assistant_message)
+        self.store_messages(user_message, assistant_message)
+
+    def store_messages(self, user_message, assistant_message):
+        past = self.cookie_manager.get()
+
+        if past:
+            print(assistant_message)
+            if user_message not in past and assistant_message not in past:
+                past.append(user_message)
+                past.append(assistant_message)
+                self.cookie_manager.set(past)
+        else:
+            self.cookie_manager.set(st.session_state.messages)
+
+    def get_messages(self):
+        return self.cookie_manager.get()
+
+    def delete_messages(self):
+        self.cookie_manager.delete()
+        self.initiate_memory()
+
+    def _generate_images(self, documents):
+        images = []
+
+        for document in documents:
+            images += [self.highlight_bbox_in_pdf(document['id'], document['page_num'], (document['xmin'], document['ymin'], document['xmax'], document['ymax']))]
+
+        images_markdown = self.create_markdown_with_images(images)
+
+        return images_markdown
+
+    def highlight_bbox_in_pdf(self, pdf_path, page_number, bbox):
+        doc = fitz.open(pdf_path)
+        page = doc.load_page(page_number)
+
+        pix = page.get_pixmap()
+        img = Image.open(io.BytesIO(pix.tobytes()))
+
+        draw = ImageDraw.Draw(img)
+        draw.rectangle(bbox, outline="red", width=2)
+
+        img_buffer = io.BytesIO()
+        img.save(img_buffer, format="PNG")  # Save the modified image to the buffer
+        encoded_image = base64.b64encode(img_buffer.getvalue()).decode("utf-8")
+
+        doc.close()
+
+        return encoded_image
+
+    def create_markdown_with_images(self, images):
+        images_html = ""
+        for base64_image in images:
+            # Assumed: each snapshot is embedded as an inline base64 <img> tag
+            # (the exact HTML in the original source was not preserved).
+            img_html = f'<img src="data:image/png;base64,{base64_image}" style="max-width:100%"/>'
+            images_html += img_html + "<br>"
" + return images_html + +class CookieTester: + def __init__(self): + self.cookie = None \ No newline at end of file diff --git a/src/components/frontend/sidebar.py b/src/components/frontend/sidebar.py new file mode 100644 index 0000000..1d7e957 --- /dev/null +++ b/src/components/frontend/sidebar.py @@ -0,0 +1,110 @@ +import streamlit as st +import base64, os +import time +import io + +class Sidebar: + def __init__(self, pipeline): + self.pipeline = pipeline + + def __call__(self): + with st.sidebar: + st.markdown( + """ + + """, unsafe_allow_html=True + ) + + + st.image('assets/eugenie.png', width=250) + + disabled = True + if 'api_key' not in st.session_state: + disabled = False + key = st.sidebar.text_input('', placeholder ='Input your OpenAI API Key: ', type='password', label_visibility='hidden', key='api_key_input', disabled=disabled) + if key: + st.session_state['api_key'] = key + st.sidebar.success('API Key Successfully Added!') + st.sidebar.divider() + + self._upload_widget() + self._show_tools() + + def _upload_widget(self): + + upload_expander = st.sidebar.expander("File Uploader", expanded=True) + with upload_expander: + pdf_docs = st.file_uploader('Select Files to Upload', accept_multiple_files=True, type=['pdf', 'txt', 'png', 'jpg']) + if st.button('Start Upload'): + for pdf in pdf_docs: + file_details = {'Filename': pdf.name, 'FileType': pdf.type, 'FileSize': pdf.size} + + progress_text = 'Checking File...' + my_bar = st.progress(0, text=progress_text) + percent_complete = 0 + + if pdf.type == "application/pdf": + percent_complete += 20 + my_bar.progress(percent_complete, text=progress_text) + progress_text = 'Processing File...' + + with open('./' + pdf.name, 'wb') as f: + f.write(pdf.read()) + + percent_complete += 30 + my_bar.progress(percent_complete, text=progress_text) + progress_text = 'Processing File...' + + status = self.pipeline.add('./' + pdf.name) + + print(status) + + percent_complete += 50 + my_bar.progress(percent_complete, text="Finalizing...") + st.success(f'File Successfully Processed!') + my_bar.empty() + del my_bar + + st.session_state['documents'] = True + + def _show_tools(self): + tools = st.sidebar.expander("Tools", expanded=True) + with tools: + options = st.selectbox( + 'List of available tools: ', + ('Chroma-DB', 'Web-Search', 'arXiv-Search', 'Calculator-App', 'Python-Interpreter')) + + if options == 'Chroma-DB': + st.markdown(""" + ## Chroma-DB + Chroma HTTP.Client object class can be used to retrieve documents with metadata based on a corresponding query embedding + """) + elif options == 'Web-Search': + st.markdown(""" + ## Web-Search + A module which can search the web and just return the results + """) + elif options == 'arXiv-Search': + st.markdown(""" + ## arXiv-Search + A module which can search arXiv's research repository with abstracts, papers, and authors. + """) + elif options == 'Calculator-App': + st.markdown(""" + ## Calculator-App + A module to which can you send in a formula in the form of a string + """) + elif options == 'Python-Interpreter': + st.markdown(""" + ## Python-Interpreter + A module to which can you send in code as a string with delimiters, and get output back + """) \ No newline at end of file