1+ import sqlite3
2+ import json
3+ import pandas as pd
4+ import sklearn
5+ from sklearn .feature_extraction .text import TfidfVectorizer
6+
class QuestionAnswerVirtualAssistant:
    """
    Used for automatic question-answering

    It works by building a reverse index store that maps each word to
    the ids of the indexed questions that contain it. To answer a user
    question we collect the ids for every word of the user question that
    is present in the index, take the intersection of those ids, rank
    the surviving questions to pick the best fit, then return the answer
    stored with that question.

    Questions and answers are stored lower-cased, and words are indexed
    and looked up lower-cased, so matching is case-insensitive.
    """

    def __init__(self, db_path="virtualassistant.sqlite3"):
        """
        Returns - None
        Input - str: `db_path`, the sqlite3 database to open. Defaults to
                the file name "virtualassistant.sqlite3"
        ----------
        - Initialize database. we use sqlite3
        - Create the tables if they do not exist yet
        - Seed the empty reverse index row if it is missing
        - maintain a class level access to the database
          connection object
        """
        self.conn = sqlite3.connect(db_path, autocommit=True)
        # Every piece is ensured on its own (instead of checking for a
        # single table) so that a database left half-initialised by an
        # interrupted first run is repaired rather than failing later.
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS IdToQuesAns"
            "(id INTEGER PRIMARY KEY, question TEXT, answer TEXT)"
        )
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS WordToId (name TEXT, value TEXT)"
        )
        index_row = self.conn.execute(
            "SELECT 1 FROM WordToId WHERE name='index'"
        ).fetchone()
        if index_row is None:
            self.conn.execute(
                "INSERT INTO WordToId VALUES (?, ?)", ("index", "{}")
            )

    def _load_reverse_index(self):
        """
        Returns - dict: the reverse index, mapping word -> list of ids
        Input - None
        ----------
        - read the JSON document stored in the 'index' row of WordToId
          and decode it
        """
        row = self.conn.execute(
            "SELECT value FROM WordToId WHERE name='index'"
        ).fetchone()
        return json.loads(row[0])

    def _save_reverse_index(self, reverse_idx):
        """
        Returns - None
        Input - dict: the reverse index, mapping word -> list of ids
        ----------
        - encode the reverse index as JSON and write it back to the
          'index' row of WordToId
        """
        self.conn.execute(
            "UPDATE WordToId SET value = (?) WHERE name='index'",
            (json.dumps(reverse_idx),),
        )

    def index_question_answer(self, question, answer):
        """
        Returns - str: the literal "index successful"
        Input - str: a string of words called `question`
              - str: the `answer` to store with that question
        ----------
        Indexes the question and answer. It does this by performing two
        operations - add the question and answer to the IdToQuesAns, then
        adds the words in the question to WordToId
        - lower-cases the question and answer
        - passes the question and answer to a method to add them
          to IdToQuesAns
        - retrieves the id of the inserted ques-answer
        - adds that id under every word of the question in the reverse
          index WordToId, unless the id is already listed for the word
        """
        # Tokenise the same lower-cased text that gets stored, so index
        # keys agree with the stored question.
        question = question.lower()
        row_id = self._add_to_IdToQuesAns(question, answer.lower())

        reverse_idx = self._load_reverse_index()
        for word in question.split():
            doc_ids = reverse_idx.setdefault(word, [])
            if row_id not in doc_ids:
                doc_ids.append(row_id)
        self._save_reverse_index(reverse_idx)
        return "index successful"

    def _add_to_IdToQuesAns(self, question, answer):
        """
        Returns - int: the id of the inserted row
        Input - str: the `question` and the `answer` to store
        ---------
        - use the class-level connection object to insert the question
          and answer into the db
        - retrieve and return the row id of the inserted row
        """
        cur = self.conn.cursor()
        res = cur.execute(
            "INSERT INTO IdToQuesAns (question, answer) VALUES (?, ?)",
            (question, answer),
        )
        return res.lastrowid

    def find_questions(self, user_input):
        """
        Returns - list[tuple]: the (id, question, answer) rows returned by
                  the _find_questions_with_idx method, or [] when no
                  indexed question matches
        Input - str: a string of words called `user_input`, expected to be a question
        ---------
        - retrieve the reverse index
        - for every word of the user input that is present in the index,
          collect the ids of the questions that contain the word; words
          that are not in the index are ignored
        - intersect the collected ids
        - use the common ids to call the _find_questions_with_idx method
        - return the result of the called method
        """
        reverse_idx = self._load_reverse_index()

        ids_per_term = []
        # split() without an argument matches the tokenisation used when
        # indexing and does not yield empty tokens on repeated spaces.
        for term in user_input.split():
            # Look the term up lower-cased (how words are indexed now)
            # and as typed (index entries written with their original
            # casing still resolve).
            doc_ids = set(reverse_idx.get(term.lower(), ()))
            doc_ids.update(reverse_idx.get(term, ()))
            if doc_ids:
                ids_per_term.append(doc_ids)

        if not ids_per_term:  # none of the words is indexed
            return []

        common_idx_of_docs = set.intersection(*ids_per_term)
        if not common_idx_of_docs:  # no question contains all the words
            return []

        return self._find_questions_with_idx(common_idx_of_docs)

    def _find_questions_with_idx(self, idxs):
        """
        Returns - list[tuple]: the (id, question, answer) rows with the idxs
        Input - iterable of idxs
        ---------
        - use the class-level connection object to retrieve the rows that
          have the idx in the input idxs.
        - retrieve and return these rows as a list
        """
        idxs = list(idxs)
        # One "?" placeholder per id keeps the query parameterised.
        placeholders = ",".join("?" * len(idxs))
        sql = "SELECT id, question, answer FROM IdToQuesAns WHERE id in ({seq})".format(
            seq=placeholders
        )
        return self.conn.execute(sql, idxs).fetchall()

    def find_most_matched_question(self, user_input, corpus):
        """
        Returns - list: [score, most_matching_question]
        Input - user_input, and list of matching questions called corpus
        ---------
        - compute the tfidf weights of the corpus
        - score every question by summing the weights of the user input
          words it contains
        - return the highest scoring question with its score; on a tie
          the question that comes first in the corpus wins
        - the corpus is expected to be non-empty; the vectorizer is left
          to raise on a corpus it cannot build a vocabulary from
        """
        vectorizer = TfidfVectorizer()
        tfidf_scores = vectorizer.fit_transform(corpus)
        tfidf_array = pd.DataFrame(
            tfidf_scores.toarray(),
            columns=vectorizer.get_feature_names_out(),
        )
        # {term: {row position in corpus: weight}}
        tfidf_dict = tfidf_array.to_dict()

        result = [[0, question] for question in corpus]
        # TfidfVectorizer lower-cases its vocabulary by default, so the
        # user's words have to be lower-cased to be found in it.
        for term in user_input.lower().split():
            if term in tfidf_dict:
                for idx, scored in enumerate(result):
                    scored[0] += tfidf_dict[term][idx]

        # Pick the best score rather than whichever question is first.
        return max(result, key=lambda scored: scored[0])

    def provide_answer(self, user_input):
        """
        Returns - str: the answer to the user_input, or None when no
                  indexed question matches it
        Input - str: user_input
        ---------
        - use the user_input to get the list of matching questions
        - return None if there is none
        - create a corpus which is a list of all matching questions
        - create a question_map that maps questions to their respective answers
        - use the user_input and corpus to find the most matching question
        - return the answer that matches that question from the question_map
        """
        matching_questions = self.find_questions(user_input)
        if not matching_questions:
            # Nothing to rank; the vectorizer cannot be fitted on an
            # empty corpus.
            return None

        corpus = [question for _, question, _ in matching_questions]
        question_map = {
            question: answer for _, question, answer in matching_questions
        }
        _, most_matching_question = self.find_most_matched_question(
            user_input, corpus
        )
        return question_map[most_matching_question]
172+
173+
if __name__ == "__main__":
    # Demo run: index four Kaggle FAQ entries (echoing the status of the
    # second indexing call), then answer two sample questions.
    assistant = QuestionAnswerVirtualAssistant()

    assistant.index_question_answer(
        "What are the different types of competitions available on Kaggle",
        "Types of Competitions Kaggle Competitions are designed to provide challenges for competitors",
    )
    indexing_status = assistant.index_question_answer(
        "How to form, manage, and disband teams in a competition",
        "Everyone that competes in a Competition does so as a team. A team is a group of one or more users",
    )
    print(indexing_status)
    assistant.index_question_answer(
        "What is Data Leakage",
        "Data Leakage is the presence of unexpected additional information in the training data",
    )
    assistant.index_question_answer(
        "How does Kaggle handle cheating",
        "Cheating is not taken lightly on Kaggle. We monitor our compliance account",
    )

    for sample_query in (
        "state Kaggle cheating policy",
        "Tell me what is data leakage",
    ):
        print(assistant.provide_answer(sample_query))
195+ print (va .provide_answer ("Tell me what is data leakage" ))
0 commit comments