# utils.py (forked from TeeOhh/tRECS)
import re
import operator

import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import gensim
import plotly.graph_objs as go
import spacy

def termdocument_matrix(data_frame):
    '''Description: This function takes in a dataframe of names and descriptions, and returns a term/document frequency matrix
    Parameters: a dataframe of names and descriptions
    Output: n by m matrix where n=terms and m=descriptions (one row per term, one column per description), plus the dictionary's token-to-id mapping'''
    descriptions = data_frame.iloc[:, 1].tolist()
    descriptions = [unicode(str(dscr), 'utf-8').split(" ") for dscr in descriptions]
    dictionary = gensim.corpora.Dictionary(descriptions)
    corpus = [dictionary.doc2bow(text) for text in descriptions]
    numpy_array = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary))
    return pd.DataFrame(numpy_array), dictionary.token2id

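# Usage sketch (hypothetical column names; any two-column frame of names and
# descriptions works, since the function reads the second column by position):
#   df = pd.DataFrame({'Name': ['Course A', 'Course B'],
#                      'Description': ['intro to python', 'python for data']})
#   matrix, token2id = termdocument_matrix(df)
#   # matrix has one row per term and one column per description
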
def make_binary(count):
    '''Description: This function converts a word frequency to 0 if the word does not appear or 1 if it appears
    Parameters: integer (frequency of a word)
    Output: 1 or 0'''
    if count > 0:
        return 1
    else:
        return 0

def get_num_of_topics(dataframe):
    '''Description: This function takes in a dataframe of names and descriptions, and returns the optimum number of topics for use by gensim's lsa model
    Parameters: a dataframe of names and descriptions
    Output: integer (optimum number of topics for lsa)'''
    frequency_matrix, bag_of_words = termdocument_matrix(dataframe)
    ## Sort bag of words by their token2id ids
    sorted_bag_of_words = sorted(bag_of_words.items(), key=operator.itemgetter(1))
    ## Set columns of matrix = bag of words
    frequency_matrix = frequency_matrix.T
    frequency_matrix.columns = [word[0] for word in sorted_bag_of_words]
    ## Set indexes of matrix = course names
    frequency_matrix.index = dataframe[dataframe.columns[0]].tolist()
    ## n = number of terms
    n = len(frequency_matrix.columns)
    ## m = number of courses (documents)
    m = len(frequency_matrix.index)
    binary_matrix = frequency_matrix.applymap(make_binary)
    ## t = number of nonzero term/document entries
    t = np.count_nonzero(binary_matrix)
    return m * n / t

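# Worked example of the heuristic above (illustrative numbers): with m = 100
# documents, n = 500 distinct terms, and t = 5000 nonzero entries in the binary
# term/document matrix, the suggested topic count is 100 * 500 / 5000 = 10
# (integer division under Python 2).
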
def build_freq_matrix(dataframe):
    '''Description: This function takes in a dataframe of names and descriptions and returns a matrix of the number of documents each term occurs in
    Parameters: a dataframe of names and descriptions
    Output: m by 1 frequency matrix where m=number of documents each word occurs in'''
    frequency_matrix, bag_of_words = termdocument_matrix(dataframe)
    ## Sort bag of words by their token2id ids
    sorted_bag_of_words = sorted(bag_of_words.items(), key=operator.itemgetter(1))
    ## Set columns of matrix = bag of words
    frequency_matrix = frequency_matrix.T
    frequency_matrix.columns = [word[0] for word in sorted_bag_of_words]
    ## Set indexes of matrix = course names
    frequency_matrix.index = dataframe[dataframe.columns[0]].tolist()
    ## Make the frequency matrix a binary matrix for document frequency
    binary_matrix = frequency_matrix.applymap(make_binary)
    ## Sum down each column to build the word/document frequency matrix
    doc_frequency = binary_matrix.apply(sum).T
    doc_frequency = doc_frequency.to_frame().reset_index()
    doc_frequency = doc_frequency.rename(columns={'index': 0, 0: 1})
    doc_frequency.sort_values(by=1, inplace=True, ascending=False)
    return doc_frequency

def build_word_freq(dataframe):
    '''Description: This function takes in a term frequency dataframe (one column per word) and returns a matrix of each word's total number of occurrences across all documents
    Parameters: a term frequency dataframe whose columns are words
    Output: m by 1 matrix where m=each word's frequency across all documents'''
    word_freq_df = pd.DataFrame()
    ## Build column of words and column of their corresponding occurrences across all documents
    word_freq_df['Word'] = pd.Series(column for column in dataframe)
    word_freq_df['Frequency'] = pd.Series(sum(dataframe[column].values) for column in dataframe)
    ## Set index as the words and sort by the occurrences (most occurrences at top and least at bottom)
    word_freq_df.set_index('Word', inplace=True)
    word_freq_df.sort_values(by='Frequency', inplace=True, ascending=False)
    return word_freq_df

## ------- START OF CLEANING METHODS -------
def master_clean(df, column, html, email, punc, non_ascii, stopwords, number, remove_nonenglish, stemorlem):
    '''Description: This function applies each selected cleaning step to one column of the dataframe
    Parameters: dataframe, column name, boolean flags for each cleaning step, and 'stem' or 'lem' for stemorlem
    Output: the dataframe with the chosen column cleaned'''
    if punc:
        df[column] = df[column].apply(remove_punc).to_frame()
    if html:
        df[column] = df[column].apply(remove_html).to_frame()
    if email:
        df[column] = df[column].apply(remove_email).to_frame()
    if non_ascii:
        df[column] = df[column].apply(remove_non_ascii).to_frame()
    if stopwords:
        df[column] = df[column].apply(remove_stop).to_frame()
    if number:
        df[column] = df[column].apply(remove_numbers).to_frame()
    if remove_nonenglish:
        df[column] = df[column].apply(nonenglish).to_frame()
    if stemorlem == 'stem':
        df[column] = df[column].apply(stemmer).to_frame()
    elif stemorlem == 'lem':
        df[column] = df[column].apply(lemmatizer).to_frame()
    return df

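# Usage sketch (hypothetical flags; strips punctuation, stopwords, and numbers
# from the chosen column, then stems whatever remains):
#   cleaned = master_clean(df, 'Description', html=False, email=False, punc=True,
#                          non_ascii=False, stopwords=True, number=True,
#                          remove_nonenglish=False, stemorlem='stem')
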
def remove_punc(string):
    '''Description: This function takes in a string of descriptions and returns a tokenized string without punctuation
    Parameters: String of descriptions
    Output: Tokenized string with punctuation removed'''
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(string)
    return " ".join(tokens)

def remove_non_ascii(text):
    '''Description: This function takes in a string of descriptions and returns the string without non-ASCII characters
    Parameters: String of descriptions
    Output: the string with every character outside the ASCII range 1-126 removed'''
    stripped = (unicode(str(c), 'utf-8') for c in text if 0 < ord(c) < 127)
    return ''.join(stripped)

def remove_stop(string):
    '''Description: This function takes in a string of descriptions and returns the string without stopwords
    Parameters: String of descriptions
    Output: String with stopwords removed (ex. a, and, the, etc.)'''
    words = unicode(str(string), 'utf-8').split(" ")
    stop_words = [word.encode('utf-8') for word in stopwords.words('english')]
    filtered_words = [word for word in words if not word.lower() in stop_words]
    return " ".join(filtered_words)

def remove_email(text):
    '''Description: This function takes in the string of descriptions and returns the string without emails
    Parameters: String of descriptions
    Output: string with all emails removed'''
    match = re.compile(r'[^\s]+@(\[?)[a-zA-Z0-9\-\.]+\.([a-zA-Z]{2,3}|[0-9]{1,3})(\]?)')
    return re.sub(match, '', text)

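# Example (illustrative string): the pattern above strips addresses such as
#   'contact jdoe@example.com for info'  ->  'contact  for info'
# (the whitespace around the address is left in place).
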
def remove_html(first_text):
    '''Description: This function takes in the string of descriptions and returns the string with html code removed
    Parameters: String of descriptions
    Output: String with all html tags and &nbsp; entities removed'''
    clean = re.compile('<.*?>')
    second_text = re.sub(clean, '', first_text)
    second_clean = re.compile('&nbsp;')
    return re.sub(second_clean, '', second_text)

def rebuild_freq_matrix(freq_matrix):
    '''Description: This function takes in the document frequency matrix and returns the matrix with words that occur in only one document removed
    Parameters: Document frequency matrix
    Output: Document frequency matrix with words that only occur in one document removed'''
    new_freq_matrix = freq_matrix.where(freq_matrix[1] != 1).dropna()
    return new_freq_matrix

def remove_occur_once(string, freq_matrix):
    '''Description: This function takes in a string of descriptions and the document frequency matrix, and returns the string with words that occur in only one document removed
    Parameters: String of descriptions and the document frequency matrix
    Output: String with words removed that only occur in one document'''
    ## Words occurring in more than one document are the ones kept
    keep_words = freq_matrix.where(freq_matrix[1] != 1).dropna()[0].tolist()
    lis = unicode(str(string), 'utf-8').split(' ')
    words = [word for word in lis if word in keep_words]
    return " ".join(words)

def stemmer(text):
    '''Description: This function takes in the string of descriptions and returns the string with all words stemmed
    Parameters: String of descriptions
    Output: String with all words stemmed (ex. "meeting" and "meetings" both become "meet")'''
    porter = PorterStemmer()
    lis = unicode(str(text), 'utf-8').split(" ")
    stemmed_words = [str(porter.stem(word)) for word in lis]
    return " ".join(stemmed_words)

def lemmatizer(text):
    '''Description: This function takes in the string of descriptions and returns the string with all words lemmatized
    Parameters: String of descriptions
    Output: String with all words lemmatized (ex. "meeting" stays "meeting" as a noun but becomes "meet" as a verb)'''
    wnl = WordNetLemmatizer()
    lis = unicode(str(text), 'utf-8').split(" ")
    lemm_words = [wnl.lemmatize(word) for word in lis]
    return " ".join(lemm_words)

def remove_cutoff(text, keep_words):
    '''Description: This function takes in the string of descriptions and returns the string with only the words from the list of words to keep in the cutoff slider
    Parameters: String and a list of words to keep
    Output: string containing only words from the list of words to keep (empty string if none remain)'''
    lis = unicode(str(text), 'utf-8').split(" ")
    words = [word for word in lis if word in keep_words]
    return " ".join(words)

def remove_numbers(text):
    '''Description: This function takes in the string of descriptions and returns the string without numbers (useful for course syllabi)
    Parameters: String of descriptions
    Output: the string without numbers'''
    text = str(text)
    match = re.compile(r'\d')
    return re.sub(match, '', text)

def nonenglish(string):
    '''Description: This function takes in the string of descriptions and returns the string with nonenglish words removed (useful for course syllabi)
    Parameters: String of descriptions
    Output: the string with nonenglish words removed'''
    words = set(nltk.corpus.words.words())
    result = [w for w in nltk.wordpunct_tokenize(string) if w.lower() in words]
    return " ".join(result)

def ensemble(list_of_recommendations, amount):
    '''Description: This function takes in the ranked list of recommendations from each model the user chooses (LSA, LDA, spacy, TF IDF) and outputs a ranked list of recommendations where the highest sum of the per-model rank scores is ranked as most similar
    Parameters: Ordered lists of (name, score) recommendations from LSA, LDA, spacy, and TF IDF
    Output: List of recommendations in order'''
    d = {}
    for lis in list_of_recommendations:
        ## Reverse so the most similar item gets the largest rank score
        reverse = lis[::-1]
        newlist = list(enumerate(reverse, start=1))
        for pair in newlist:
            if pair[1][0] not in d:
                d[pair[1][0]] = pair[0]
            else:
                d[pair[1][0]] += pair[0]
    d = sorted(d.items(), key=operator.itemgetter(1))
    return [course for course in d][::-1][:amount]

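# Worked example (hypothetical names): with two ranked lists of (name, score)
# pairs,
#   [('A', 0.9), ('B', 0.5), ('C', 0.2)] and [('A', 0.8), ('C', 0.4), ('B', 0.1)],
# the top of each 3-item list earns 3 rank points and the bottom earns 1, so
# A = 3 + 3 = 6, B = 2 + 1 = 3, C = 1 + 2 = 3, and 'A' is returned first.
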
def get_similar(course_name, amount, model_tuple):
    '''Description: This function takes in a name and a model (LSA or TF IDF) and returns a ranked list where the first entry is the most similar description and the mth entry is the least similar description, m=amount of descriptions the user chooses
    Parameters: name, amount, model_tuple
    Output: list of similar descriptions ranked by similarity where the first is most similar'''
    index = model_tuple[0]
    matrix = model_tuple[1]
    input_doc = matrix.loc[course_name]
    index.num_best = amount + 1
    top = index[input_doc]
    return get_names(matrix, top)

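# Usage sketch (hypothetical objects): model_tuple pairs a gensim similarity
# index with a DataFrame of document vectors indexed by course name, e.g.
#   top_five = get_similar('Intro to Python', 5, (similarity_index, doc_matrix))
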
def get_similar_lda(course_name, amount, model_tuple):
    '''Description: This function takes in a name and the LDA model and returns a ranked list where the first entry is the most similar description and the mth entry is the least similar description, m=amount of descriptions the user chooses
    Parameters: name, amount, model_tuple
    Output: list of similar descriptions ranked by similarity where the first is most similar'''
    index = model_tuple[0]
    matrix = model_tuple[1]
    input_doc = matrix.loc[course_name]
    index.num_best = amount + 1
    top = index[input_doc]
    top = top[0]
    return get_names(matrix, top)

def get_similar_spacy(course_name, amount, df):
    '''Description: This function takes in a name, an amount, and a mapping of course names to spacy documents, and returns a ranked list where the first entry is the most similar description and the mth entry is the least similar description, m=amount of descriptions the user chooses
    Parameters: name, amount, mapping of course names to spacy documents
    Output: list of similar descriptions ranked by similarity where the first is most similar'''
    input_doc = df[course_name]
    top = {k: input_doc.similarity(v) for k, v in df.items() if len(v) > 0}
    top = sorted(top.items(), key=operator.itemgetter(1), reverse=True)
    top = top[:amount + 1]
    return top

def get_names(df, lis):
    '''Description: This function takes in a dataframe and the (index, score) list from the get_similar functions, and returns a list of similarities keyed by course name
    Parameters: dataframe of document vectors, list of (index, score) pairs
    Output: list of (course name, similarity) pairs'''
    named = [(df.index[couple[0]], couple[1]) for couple in lis]
    return named

def concat_descr(column):
    '''Description: This function takes in a column from the dataframe and concatenates its entries for the spacy model
    Parameters: description column(s) from the dataframe
    Output: concatenated string of descriptions'''
    dscr = ''
    for d in column:
        d = d.decode('utf-8')
        dscr += d
        dscr += ' '
    return dscr

def get_all_entities(document):
    '''Description: This function converts a document to a dictionary of entity labels and a list of the entities (of that label) found in the document
    Parameters: String of descriptions
    Output: A dictionary in the form entity label: [word1, word2, ...]'''
    nlp = spacy.load('en')
    doc = nlp(document)
    d = {}
    for ent in doc.ents:
        label = ent.label_
        text = ent.text
        if label in d:
            d[label].append(text)
        else:
            d[label] = [text]
    return d

def build_ent_dic(entity_dic):
    '''Description: This function takes a dictionary of entities and the words found for them in the data set, and converts it to a dictionary of dictionaries counting how many times each word occurs in the dataset, for each category of entity
    Params: Dictionary of form entity: [word1, word2, ...]
    Output: Dictionary of dictionaries in the form entity label: {word1: count, word2: count}'''
    d = {}
    for label, texts in entity_dic.items():
        if label not in d:
            d[label] = {}
        for word in texts:
            if word not in d[label]:
                d[label][word] = 1
            else:
                d[label][word] += 1
    return d

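# Example (illustrative input): {'ORG': ['MIT', 'MIT', 'NASA']} becomes
#   {'ORG': {'MIT': 2, 'NASA': 1}}
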
def breakdown_ents(label, d):
    '''Description: This function takes in an entity category label and returns the data for a pie chart that breaks down the words of that category of entity
    Parameters: The label of entity and the dictionary of entity occurrence counts from the above function
    Output: The data needed for the pie chart'''
    dic = d[label]
    x = [word for word, amount in dic.items()]
    y = [amount for word, amount in dic.items()]
    pie_data = [go.Pie(labels=x, values=y, textinfo='percent', name=label)]
    return pie_data

def get_dataframe(label, d):
    '''Description: This function builds a dataframe of words and their occurrence counts for a particular entity
    Parameters: The label of entity and the dictionary of entity occurrence counts from the build_ent_dic function
    Output: A dataframe of words and their occurrence counts for the desired entity'''
    df = pd.DataFrame(d)
    df = df.T
    df = df.loc[label].dropna().to_frame()
    df = df.reset_index()
    df = df.rename(columns={"index": "Entities", label: "Counts"})
    df = df.sort_values(by='Counts', ascending=False)
    return df