-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbookerReaderModule.py
136 lines (108 loc) · 5.68 KB
/
bookerReaderModule.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from langchain.evaluation.qa import QAEvalChain
import config
from indexing.ContenLoaderModule import ContentLoader
from indexing.DocumentTransformerModule import DocumentTransformer
from indexing.chromadbModule import ChromadbClient
from indexing.embeddings import EmbeddingUtility
from chains.QnAChain import QnAChain
from chains.evaluationModule import qa_evaluation_chain
from typing import List
import re
# Settings read from the project's ``config`` module once, at import time.
# NOTE(review): none of these three names is referenced again in this file as
# shown — confirm whether other modules import them before removing any.
openai_api_key = config.OPENAI_API_KEY
llm_model = config.LLM_MODEL
collection_name = config.COLLECTION_NAME
class BookerReaderModule():
    """Index document text into a vector collection and answer queries from it.

    The class wires together three project helpers:

    - ``ChromadbClient`` (``self.database``) -- collection storage and lookup.
    - ``EmbeddingUtility`` (``self.embeddings``) -- turns text into embeddings.
    - ``QnAChain`` -- produces the final answer from the retrieved documents.
    """

    # Collection used by every method that touches the database.
    # NOTE(review): the module-level ``collection_name`` read from config is
    # not used here; confirm whether it should replace this literal.
    _COLLECTION_NAME = "49ers34-31"

    # Compiled once for the whole class instead of on every call.  A token
    # matches when it starts with http:// or https://, or when the whole token
    # is a bare domain / "localhost" / dotted IPv4 address with an optional
    # port and path.
    _URL_PATTERN = re.compile(
        r'^https?://|'  # http:// or https://
        # domain...
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    def __init__(self):
        """Create the database client and the embedding helper.

        Nothing is loaded or indexed here; the collection is opened (or
        created) lazily by ``process_user_query`` and
        ``uploadDataFromScratch``.
        """
        self.database = ChromadbClient()
        self.embeddings = EmbeddingUtility()

    def check_if_url_contains(self, query):
        """Return the first URL-looking token found in ``query``.

        Args:
            query: Free-form message text. ``None`` or an empty string is
                treated as "no URL".

        Returns:
            The first whitespace-separated token that matches the URL
            pattern, converted to lower case, or ``None`` when no token
            matches.

        Example:
            bookReader.check_if_url_contains("read https://example.com/a")
            # -> "https://example.com/a"
        """
        if not query:
            return None
        # split() with no argument breaks on any whitespace run, so URLs that
        # follow a tab or a newline are found as well.
        for token in query.split():
            if self._URL_PATTERN.match(token):
                return token.lower()
        return None

    def process_user_query(self, query):
        """Answer ``query`` from the documents stored in the collection.

        Opens (or creates) the collection, retrieves the entries matching
        ``query`` and passes them with the query to a ``QnAChain`` of type
        ``"stuff"``.

        Args:
            query: The user's question.

        Returns:
            The ``'output_text'`` entry of the chain response.
        """
        self.database.get_or_create_colletion(self.embeddings, self._COLLECTION_NAME)
        print("count ->",self.database.get_colletion_items_count())
        docStore = self.database.query_data(query)
        print("Here is the response from DB:\n", docStore)

        # invoke Q&A chain
        mychain = QnAChain(type="stuff")
        response = mychain.invokeChain(docStore, query)
        answer = response.get('output_text')
        print(answer)
        return answer

    def preProcessDataIndexing(self, fileType, fileURL):
        """Load a file, split it into chunks and index the chunks.

        Args:
            fileType: Forwarded to ``ContentLoader.loadFile``.
            fileURL: Forwarded to ``ContentLoader.loadFile``.

        Returns:
            Whatever ``uploadDataFromScratch`` returns (currently ``None``).
        """
        # Load the raw content, then split it into smaller documents.
        data = ContentLoader.loadFile(fileType=fileType, fileURL=fileURL)
        texts = DocumentTransformer.text_splitter(data)
        return self.uploadDataFromScratch(texts)

    def uploadDataFromScratch(self, texts):
        """Embed and store ``texts`` if the collection is still empty.

        Args:
            texts: Iterable of objects exposing a ``page_content`` attribute.

        Returns:
            None.
        """
        self.database.get_or_create_colletion(self.embeddings, self._COLLECTION_NAME)
        print("current Items in collection",self.database.get_colletion_items_count())
        # Skip the upload when the collection already holds data so repeated
        # runs do not insert the same documents again.
        if self.check_if_data_exist():
            return
        print("current Items in collection",self.database.get_colletion_items_count())
        text_doc_list = [text.page_content for text in texts]
        embeddings = self.create_embeddings(text_doc_list)
        self.store_data_and_embeddings(embeddings, text_doc_list)
        print("count ->",self.database.get_colletion_items_count())

    def check_if_data_exist(self) -> bool:
        """Report whether the collection already contains items.

        Returns:
            ``False`` when the collection item count is 0, ``True`` otherwise.
        """
        result = self.database.get_colletion_items_count() != 0
        print("IF DB EXIST OR NOT: ", result)
        return result

    def create_embeddings(self, text_doc_list: List[str]):
        """Create embeddings for a list of document strings.

        Args:
            text_doc_list: The document texts to embed.

        Returns:
            The result of ``EmbeddingUtility.embed_list_of_documents`` for
            the given list.
        """
        return self.embeddings.embed_list_of_documents(text_doc_list)

    def store_data_and_embeddings(self, embeddings, text_document_list):
        """Store the embeddings and their source strings in the collection.

        Args:
            embeddings: Embeddings as produced by ``create_embeddings``.
            text_document_list: The document strings the embeddings were
                computed from.
        """
        self.database.add_data_into_collection(embeddings, text_document_list)