-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
354 lines (313 loc) · 16.2 KB
/
main.py
File metadata and controls
354 lines (313 loc) · 16.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
from Models.logistic_regression import BinaryLogisticRegressionModel
from Models.model import Model
from Models.clustermodel import Kmeans
from data_manipulator_interface import bag_of_words_full_no_empty, tfidf_no_empty, doc2vec_simple, bag_of_words_full_no_empty_val_no_num_no_short_no_repeat
from data_reader import DataReader
from data_manipulator import *
from cached_models import _get_nn_model_bag_of_words_simple_scratch
from cached_models import _get_random_forest_model_bag_of_words_full
from cached_models import _get_naive_bayes_model_bag_of_words_full
from cached_models import _get_multiclass_logistic_regression_model_tfidf
from cached_models import _get_random_forest_model_tfidf
from cached_models import _get_naive_bayes_model_tfidf
from cached_models import _get_multiclass_logistic_regression_model_doc2vec_simple
from cached_models import _get_random_forest_model_doc2vec_simple
from cached_models import _get_naive_bayes_model_doc2vec_simple
from cached_models import _get_multiclass_logistic_regression_model_bag_of_words_full
from supervised_methods import eval_nn, evaluate_model_nn, eval_model, evaluate_model, randomword
from run_autoencoder import get_encoder
import pandas as pd
# Code to get evaluation results
def main():
    """Entry point: run the qualitative method evaluation.

    Each commented-out call below runs one experiment; uncomment to use:
      - eval_pub_med(): LR / RF / NB / NN on averaged PubMed word2vec vectors.
      - eval_ae(): LR / RF / NB / NN on an autoencoder + bag-of-words encoding.
      - eval(): LR / RF / NB on bag-of-words, tf-idf and doc2vec representations.
      - per_site_accuracy_increase(): generalizability check (NN + bag-of-words);
        results go to randomly named 7-character files in output_dir.
    Currently active: k-means clustering with per-cluster top keywords.
    """
    # eval_pub_med()
    # eval_ae()
    # eval()
    # per_site_accuracy_increase()
    top_keywords_kmeans()
def eval_pub_med():
    """Evaluate NN, logistic regression, random forest and naive bayes on
    reports embedded as averaged PubMed word2vec vectors.

    Requires the pretrained binary from
    http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin
    in the working directory. Prints precision/recall/F1/accuracy per model.
    """
    from gensim.models.keyedvectors import KeyedVectors

    def _avg_word_vectors(token_lists, w2v, dim=200):
        """Average the word2vec vectors of each token list.

        Tokens missing from the pubmed vocabulary are skipped; a row with no
        in-vocabulary tokens becomes a zero vector of length ``dim`` (the
        pubmed model's vector size).
        """
        averaged = []
        for tokens in token_lists:
            vectors = [w2v.get_vector(w) for w in tokens if w in w2v.wv.vocab]
            if not vectors:
                # nothing was covered by the pubmed vocabulary
                averaged.append(np.zeros(dim))
            else:
                averaged.append(np.average(np.array(vectors), axis=0))
        return np.array(averaged)

    # Load the pretrained pubmed word2vec model.
    model = KeyedVectors.load_word2vec_format('wikipedia-pubmed-and-PMC-w2v.bin', binary=True)
    # Load data into train/validate/test sets.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(df)
    # Tokenize each split, then embed it as the mean of its word vectors.
    tokens_train, train_y_raw = tokenize(train_x_raw, train_y_raw,
                                         save_missing_feature_as_string=False, remove_empty=True)
    pub_med_train = _avg_word_vectors(tokens_train, model)
    tokens_val, val_y_raw = tokenize(val_x_raw, val_y_raw,
                                     save_missing_feature_as_string=False, remove_empty=True)
    pub_med_val = _avg_word_vectors(tokens_val, model)
    tokens_test, test_y_raw = tokenize(test_x_raw, test_y_raw,
                                       save_missing_feature_as_string=False, remove_empty=True)
    pub_med_test = _avg_word_vectors(tokens_test, model)
    # Neural network: train, then report precision, recall, F1 and accuracy.
    print("pubmed, nn")
    nn_model = _get_nn_model_bag_of_words_simple_scratch(pub_med_train, train_y_raw, pub_med_val, val_y_raw,
                                                         data_reader.get_region_labels()['Code'],
                                                         epochs=100, batch_size=256)
    eval_nn(nn_model, pub_med_test, test_y_raw)
    evaluate_model_nn(nn_model, pub_med_test, test_y_raw, plot_roc=False)
    # Logistic regression.
    print("pubmed, logistic regression")
    from Models.logistic_regression import MultiClassLogisticRegression
    log_reg = MultiClassLogisticRegression()
    log_reg.train(pub_med_train, train_y_raw)
    eval_model(log_reg, pub_med_test, test_y_raw)
    evaluate_model(log_reg, pub_med_test, test_y_raw, plot_roc=False)
    # Random forest.
    print("pubmed, random forest")
    from Models.random_forest import RandomForest
    rand_for = RandomForest()
    rand_for.train(pub_med_train, train_y_raw)
    eval_model(rand_for, pub_med_test, test_y_raw)
    evaluate_model(rand_for, pub_med_test, test_y_raw, plot_roc=False)
    # Naive bayes.
    print("pubmed, naivebayes")
    from Models.naive_bayes import NaiveBayes
    nb = NaiveBayes()
    nb.train(pub_med_train, train_y_raw)
    eval_model(nb, pub_med_test, test_y_raw)
    evaluate_model(nb, pub_med_test, test_y_raw, plot_roc=False)
def eval_ae():
    """Evaluate NN, logistic regression, random forest and naive bayes on an
    autoencoder compression of the bag-of-words representation.

    Prints precision/recall/F1/accuracy for each model.
    """
    from types import SimpleNamespace
    from Models.logistic_regression import MultiClassLogisticRegression
    from Models.random_forest import RandomForest
    from Models.naive_bayes import NaiveBayes
    # Load data into train/validate/test sets.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(df)
    train_x, train_y, val_x, val_y, test_x, test_y = bag_of_words_full_no_empty_val_no_num_no_short_no_repeat(
        train_x_raw, train_y_raw,
        val_x_raw, val_y_raw, test_x_raw,
        test_y_raw)
    # Train an autoencoder with a 4096-unit code and encode all three splits.
    encoder = get_encoder(train_x, test_x, 4096)
    encoded_train = encoder.predict(train_x)
    encoded_test = encoder.predict(test_x)
    encoded_val = encoder.predict(val_x)
    # Neural network: train, then report precision, recall, F1 and accuracy.
    print('neural net ae')
    nn_model = _get_nn_model_bag_of_words_simple_scratch(encoded_train, train_y, encoded_val, val_y,
                                                         data_reader.get_region_labels()['Code'], epochs=100,
                                                         batch_size=256)
    eval_nn(nn_model, encoded_test, test_y)
    evaluate_model_nn(nn_model, encoded_test, test_y)
    # Classical models share the same train/evaluate sequence.
    for label, model_cls in (('logistic regression ae', MultiClassLogisticRegression),
                             ('random forest ae', RandomForest),
                             ('naive bayes ae', NaiveBayes)):
        print(label)
        clf = model_cls()
        clf.train(encoded_train, train_y)
        # eval_model expects an object exposing the classifier on a .model
        # attribute, so wrap the trained classifier in a plain namespace.
        eval_model(SimpleNamespace(model=clf), encoded_test, test_y)
        evaluate_model(clf, encoded_test, test_y)
def eval():
    """Evaluate logistic regression, random forest and naive bayes on three
    representations: bag-of-words, tf-idf and doc2vec.

    Prints precision/recall/F1/accuracy for each model/representation pair.
    NOTE: this function shadows the builtin ``eval``; kept for compatibility
    with existing callers.
    """
    # Load data.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)
    # --- bag of words ---
    train_x, train_y, test_x, test_y = bag_of_words_full_no_empty(train_x_raw, train_y_raw, test_x_raw, test_y_raw)
    print("logistic regression, bag of words")
    log_reg_bow = _get_multiclass_logistic_regression_model_bag_of_words_full(train_x, train_y)
    eval_model(log_reg_bow, test_x, test_y)
    evaluate_model(log_reg_bow, test_x, test_y, plot_roc=False)
    print("random forest, bag of words")
    rand_for_bow = _get_random_forest_model_bag_of_words_full(train_x, train_y)
    eval_model(rand_for_bow, test_x, test_y)
    evaluate_model(rand_for_bow, test_x, test_y, plot_roc=False)
    print("naive bayes, bag of words")
    nb_bow = _get_naive_bayes_model_bag_of_words_full(train_x, train_y)
    # Naive bayes needs a dense matrix, hence the sparse-to-dense .A view.
    eval_model(nb_bow, test_x.A, test_y)
    evaluate_model(nb_bow, test_x.A, test_y, plot_roc=False)
    # --- tf-idf ---
    train_x, train_y, test_x, test_y = tfidf_no_empty(train_x_raw, train_y_raw, test_x_raw, test_y_raw)
    print("logistic regression, tfidf")
    log_reg_tfidf = _get_multiclass_logistic_regression_model_tfidf(train_x, train_y)
    eval_model(log_reg_tfidf, test_x, test_y)
    evaluate_model(log_reg_tfidf, test_x, test_y, plot_roc=False)
    print("random forest, tfidf")
    rand_for_tfidf = _get_random_forest_model_tfidf(train_x, train_y)
    eval_model(rand_for_tfidf, test_x, test_y)
    evaluate_model(rand_for_tfidf, test_x, test_y, plot_roc=False)
    print("naive bayes, tfidf")
    nb_tfidf = _get_naive_bayes_model_tfidf(train_x.A, train_y)
    eval_model(nb_tfidf, test_x.A, test_y)
    # Fixed: previously passed the sparse matrix here while eval_model got the
    # dense .A view; naive bayes is trained on dense input, so use .A for both.
    evaluate_model(nb_tfidf, test_x.A, test_y, plot_roc=False)
    # --- doc2vec ---
    train_x, train_y, test_x, test_y = doc2vec_simple(train_x_raw, train_y_raw, test_x_raw, test_y_raw)
    print("logistic regression, doc2vec")
    log_reg_d2v = _get_multiclass_logistic_regression_model_doc2vec_simple(train_x, train_y)
    eval_model(log_reg_d2v, test_x, test_y)
    evaluate_model(log_reg_d2v, test_x, test_y, plot_roc=False)
    print("random forest, doc2vec")
    rand_for_d2v = _get_random_forest_model_doc2vec_simple(train_x, train_y)
    eval_model(rand_for_d2v, test_x, test_y)
    evaluate_model(rand_for_d2v, test_x, test_y, plot_roc=False)
    print("naive bayes, doc2vec")
    nb_d2v = _get_naive_bayes_model_doc2vec_simple(train_x, train_y)
    eval_model(nb_d2v, test_x, test_y)
    evaluate_model(nb_d2v, test_x, test_y, plot_roc=False)
def per_site_accuracy_increase():
    """Generalizability check: how does accuracy on unseen hospitals change
    as more source hospitals are included in the training set?

    For each of 30 runs: shuffle the per-hospital dataframes, then for every
    split point k train the NN on hospitals 1..k and test on the rest.
    Each run writes "n_train_sites, n_test_sites, accuracy, n_train_rows"
    lines to a randomly named 7-character file in output_dir.
    """
    from random import shuffle
    from Models.neural_net import MultiClassNNScratch
    # Load data and build the shared vocabulary over the whole corpus.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    all_tokens, _ = tokenize(df, df, save_missing_feature_as_string=False, remove_empty=True)
    _, _, vocab = tokens_to_bagofwords(all_tokens, all_tokens)
    # Split data per source hospital into separate dataframes.
    site_frames = [df[df['src_file'] == src] for src in df['src_file'].unique()]
    # Save untrained weights once so the network can be reset cheaply
    # before every split instead of rebuilding the model.
    model = MultiClassNNScratch((0, len(vocab)), np.array(data_reader.get_region_labels()['Code']),
                                epochs=100, batch_size=256)
    model.model.save_weights('empty_model.h5')
    # Repeat the evaluation 30 times, each run with its own output file.
    for _ in range(30):
        shuffle(site_frames)
        # 'with' guarantees the output file is closed even if training fails
        # (the original left the handle open on exceptions).
        with open("output_dir/" + randomword(7) + '.txt', "w") as out_file:
            # Train on sites [0, k), test on sites [k, end).
            for k in range(1, len(site_frames)):
                model.model.load_weights('empty_model.h5')  # reset network
                train_set = site_frames[:k]
                test_set = site_frames[k:]
                test_x_raw, test_y_raw = get_x_y_split(pd.concat(test_set))
                test_tokens, test_y_raw = tokenize(test_x_raw, test_y_raw,
                                                   save_missing_feature_as_string=False, remove_empty=True)
                test_x, test_y, _ = tokens_to_bagofwords(test_tokens, test_y_raw, feature_names=vocab)
                train_frame = pd.concat(train_set)
                train_x_raw, train_y_raw, val_x_raw, val_y_raw = get_train_test_split(train_frame)
                train_tokens, train_y_raw = tokenize(train_x_raw, train_y_raw,
                                                     save_missing_feature_as_string=False, remove_empty=True)
                train_x, train_y, _ = tokens_to_bagofwords(train_tokens, train_y_raw, feature_names=vocab)
                val_tokens, val_y_raw = tokenize(val_x_raw, val_y_raw,
                                                 save_missing_feature_as_string=False, remove_empty=True)
                val_x, val_y, _ = tokens_to_bagofwords(val_tokens, val_y_raw, feature_names=vocab)
                model.set_train_data(train_x, train_y)
                model.train(val_x, val_y)
                accuracy = evaluate_model_nn(model, test_x, test_y, plot_roc=False)
                # Fixed: added the trailing newline so each split is one row
                # (the original concatenated every record onto a single line).
                out_file.write("%d, %d, %4.2f, %d\n" % (len(train_set), len(test_set), accuracy, len(train_frame)))
def top_keywords_kmeans():
    """Cluster the reports with k-means on a tf-idf representation and print
    the top 10 keywords of each cluster.

    ON WG IDENTIFIER classes with fewer than 5 training samples are dropped
    before clustering.
    """
    from collections import Counter
    # Load data.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)
    train_x_raw.drop(['RIS PROCEDURE CODE'], axis=1, inplace=True)
    test_x_raw.drop(['RIS PROCEDURE CODE'], axis=1, inplace=True)
    # Identify ON WG IDENTIFIERs that occur fewer than min_samples times.
    # Counter gives O(n) counting instead of list.count per unique id (O(n^2)).
    min_samples = 5
    id_counts = Counter(train_y_raw['ON WG IDENTIFIER'].values.tolist())
    small_clusters = [ident for ident, count in id_counts.items() if count < min_samples]
    # Drop the under-represented identifiers from the training split.
    train_x_raw = train_x_raw[~train_y_raw['ON WG IDENTIFIER'].isin(small_clusters)]
    train_y_raw = train_y_raw[~train_y_raw['ON WG IDENTIFIER'].isin(small_clusters)]
    num_clusters = len(id_counts) - len(small_clusters)
    # Append the ON WG IDENTIFIERs to the original documents.
    train_y_raw = pd.concat([train_x_raw, train_y_raw], axis=1)
    test_y_raw = pd.concat([test_x_raw, test_y_raw], axis=1)
    # Tokenize and subsample.
    tokens_train, train_y_raw = tokenize_columns(train_x_raw, train_y_raw, regex_string=r'[a-zA-Z0-9]+',
                                                 save_missing_feature_as_string=False, remove_short=True,
                                                 remove_num=True, remove_empty=True)
    tokens_test, test_y_raw = tokenize_columns(test_x_raw, test_y_raw, regex_string=r'[a-zA-Z0-9]+',
                                               save_missing_feature_as_string=False, remove_short=True,
                                               remove_num=True, remove_empty=True)
    # Get the tf-idf representation of the data (dense arrays for k-means).
    train_x, train_y, feature_names = tokens_to_bagofwords(tokens_train, train_y_raw, TfidfVectorizer)
    test_x, test_y, _ = tokens_to_bagofwords(tokens_test, test_y_raw, TfidfVectorizer, feature_names=feature_names)
    train_x = train_x.toarray()
    test_x = test_x.toarray()
    # Run k-means.
    kmeans = Kmeans(num_clusters, feature_names, train_x, train_y, "tfidf")
    kmeans.eval()
    labels = kmeans.get_labels()
    # Group rows by cluster and take the mean tf-idf weight of each term,
    # then print the n_terms highest-weighted terms per cluster.
    n_terms = 10
    cluster_means = pd.DataFrame(train_x).groupby(labels).mean()
    for cluster_id, row in cluster_means.iterrows():
        top_terms = [str(feature_names[t]) for t in np.argsort(row)[-n_terms:]]
        print('Cluster {}: '.format(cluster_id) + ','.join(top_terms))
# Script entry point: run the currently enabled evaluation in main().
if __name__ == '__main__':
    main()