-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
354 lines (313 loc) · 16.2 KB
/
main.py
File metadata and controls
354 lines (313 loc) · 16.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
from Models.logistic_regression import BinaryLogisticRegressionModel
from Models.model import Model
from Models.clustermodel import Kmeans
from data_manipulator_interface import bag_of_words_full_no_empty, tfidf_no_empty, doc2vec_simple, bag_of_words_full_no_empty_val_no_num_no_short_no_repeat
from data_reader import DataReader
from data_manipulator import *
from cached_models import _get_nn_model_bag_of_words_simple_scratch
from cached_models import _get_random_forest_model_bag_of_words_full
from cached_models import _get_naive_bayes_model_bag_of_words_full
from cached_models import _get_multiclass_logistic_regression_model_tfidf
from cached_models import _get_random_forest_model_tfidf
from cached_models import _get_naive_bayes_model_tfidf
from cached_models import _get_multiclass_logistic_regression_model_doc2vec_simple
from cached_models import _get_random_forest_model_doc2vec_simple
from cached_models import _get_naive_bayes_model_doc2vec_simple
from cached_models import _get_multiclass_logistic_regression_model_bag_of_words_full
from supervised_methods import eval_nn, evaluate_model_nn, eval_model, evaluate_model, randomword
from run_autoencoder import get_encoder
import pandas as pd
# Code to get evaluation results
def main():
    """Entry point: run the qualitative method evaluation.

    Each commented-out call below runs one experiment; uncomment to use:
      - eval_pub_med(): LR / RF / NB / NN on averaged PubMed word2vec vectors.
      - eval_ae(): LR / RF / NB / NN on an autoencoder + bag-of-words encoding.
      - eval(): LR / RF / NB on bag-of-words, tf-idf and doc2vec representations.
      - per_site_accuracy_increase(): generalizability check (NN + bag-of-words);
        results go to randomly named 7-character files in output_dir.
    Currently active: k-means clustering with per-cluster top keywords.
    """
    # eval_pub_med()
    # eval_ae()
    # eval()
    # per_site_accuracy_increase()
    top_keywords_kmeans()
def eval_pub_med():
    """Evaluate NN, logistic regression, random forest and naive bayes on
    reports embedded as averaged PubMed word2vec vectors.

    Requires the pretrained binary from
    http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin
    in the working directory. Prints precision/recall/F1/accuracy per model.
    """
    from gensim.models.keyedvectors import KeyedVectors

    def _avg_word_vectors(token_lists, w2v, dim=200):
        """Average the word2vec vectors of each token list.

        Tokens missing from the pubmed vocabulary are skipped; a row with no
        in-vocabulary tokens becomes a zero vector of length ``dim`` (the
        pubmed model's vector size).
        """
        averaged = []
        for tokens in token_lists:
            vectors = [w2v.get_vector(w) for w in tokens if w in w2v.wv.vocab]
            if not vectors:
                # nothing was covered by the pubmed vocabulary
                averaged.append(np.zeros(dim))
            else:
                averaged.append(np.average(np.array(vectors), axis=0))
        return np.array(averaged)

    # Load the pretrained pubmed word2vec model.
    model = KeyedVectors.load_word2vec_format('wikipedia-pubmed-and-PMC-w2v.bin', binary=True)
    # Load data into train/validate/test sets.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(df)
    # Tokenize each split, then embed it as the mean of its word vectors.
    tokens_train, train_y_raw = tokenize(train_x_raw, train_y_raw,
                                         save_missing_feature_as_string=False, remove_empty=True)
    pub_med_train = _avg_word_vectors(tokens_train, model)
    tokens_val, val_y_raw = tokenize(val_x_raw, val_y_raw,
                                     save_missing_feature_as_string=False, remove_empty=True)
    pub_med_val = _avg_word_vectors(tokens_val, model)
    tokens_test, test_y_raw = tokenize(test_x_raw, test_y_raw,
                                       save_missing_feature_as_string=False, remove_empty=True)
    pub_med_test = _avg_word_vectors(tokens_test, model)
    # Neural network: train, then report precision, recall, F1 and accuracy.
    print("pubmed, nn")
    nn_model = _get_nn_model_bag_of_words_simple_scratch(pub_med_train, train_y_raw, pub_med_val, val_y_raw,
                                                         data_reader.get_region_labels()['Code'],
                                                         epochs=100, batch_size=256)
    eval_nn(nn_model, pub_med_test, test_y_raw)
    evaluate_model_nn(nn_model, pub_med_test, test_y_raw, plot_roc=False)
    # Logistic regression.
    print("pubmed, logistic regression")
    from Models.logistic_regression import MultiClassLogisticRegression
    log_reg = MultiClassLogisticRegression()
    log_reg.train(pub_med_train, train_y_raw)
    eval_model(log_reg, pub_med_test, test_y_raw)
    evaluate_model(log_reg, pub_med_test, test_y_raw, plot_roc=False)
    # Random forest.
    print("pubmed, random forest")
    from Models.random_forest import RandomForest
    rand_for = RandomForest()
    rand_for.train(pub_med_train, train_y_raw)
    eval_model(rand_for, pub_med_test, test_y_raw)
    evaluate_model(rand_for, pub_med_test, test_y_raw, plot_roc=False)
    # Naive bayes.
    print("pubmed, naivebayes")
    from Models.naive_bayes import NaiveBayes
    nb = NaiveBayes()
    nb.train(pub_med_train, train_y_raw)
    eval_model(nb, pub_med_test, test_y_raw)
    evaluate_model(nb, pub_med_test, test_y_raw, plot_roc=False)
def eval_ae():
    """Evaluate NN, logistic regression, random forest and naive bayes on an
    autoencoder compression of the bag-of-words representation.

    Prints precision/recall/F1/accuracy for each model.
    """
    from types import SimpleNamespace
    from Models.logistic_regression import MultiClassLogisticRegression
    from Models.random_forest import RandomForest
    from Models.naive_bayes import NaiveBayes
    # Load data into train/validate/test sets.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(df)
    train_x, train_y, val_x, val_y, test_x, test_y = bag_of_words_full_no_empty_val_no_num_no_short_no_repeat(
        train_x_raw, train_y_raw,
        val_x_raw, val_y_raw, test_x_raw,
        test_y_raw)
    # Train an autoencoder with a 4096-unit code and encode all three splits.
    encoder = get_encoder(train_x, test_x, 4096)
    encoded_train = encoder.predict(train_x)
    encoded_test = encoder.predict(test_x)
    encoded_val = encoder.predict(val_x)
    # Neural network: train, then report precision, recall, F1 and accuracy.
    print('neural net ae')
    nn_model = _get_nn_model_bag_of_words_simple_scratch(encoded_train, train_y, encoded_val, val_y,
                                                         data_reader.get_region_labels()['Code'], epochs=100,
                                                         batch_size=256)
    eval_nn(nn_model, encoded_test, test_y)
    evaluate_model_nn(nn_model, encoded_test, test_y)
    # Classical models share the same train/evaluate sequence.
    for label, model_cls in (('logistic regression ae', MultiClassLogisticRegression),
                             ('random forest ae', RandomForest),
                             ('naive bayes ae', NaiveBayes)):
        print(label)
        clf = model_cls()
        clf.train(encoded_train, train_y)
        # eval_model expects an object exposing the classifier on a .model
        # attribute, so wrap the trained classifier in a plain namespace.
        eval_model(SimpleNamespace(model=clf), encoded_test, test_y)
        evaluate_model(clf, encoded_test, test_y)
def eval():
    """Evaluate logistic regression, random forest and naive bayes on three
    representations: bag-of-words, tf-idf and doc2vec.

    Prints precision/recall/F1/accuracy for each model/representation pair.
    NOTE: this function shadows the builtin ``eval``; kept for compatibility
    with existing callers.
    """
    # Load data.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)
    # --- bag of words ---
    train_x, train_y, test_x, test_y = bag_of_words_full_no_empty(train_x_raw, train_y_raw, test_x_raw, test_y_raw)
    print("logistic regression, bag of words")
    log_reg_bow = _get_multiclass_logistic_regression_model_bag_of_words_full(train_x, train_y)
    eval_model(log_reg_bow, test_x, test_y)
    evaluate_model(log_reg_bow, test_x, test_y, plot_roc=False)
    print("random forest, bag of words")
    rand_for_bow = _get_random_forest_model_bag_of_words_full(train_x, train_y)
    eval_model(rand_for_bow, test_x, test_y)
    evaluate_model(rand_for_bow, test_x, test_y, plot_roc=False)
    print("naive bayes, bag of words")
    nb_bow = _get_naive_bayes_model_bag_of_words_full(train_x, train_y)
    # Naive bayes needs a dense matrix, hence the sparse-to-dense .A view.
    eval_model(nb_bow, test_x.A, test_y)
    evaluate_model(nb_bow, test_x.A, test_y, plot_roc=False)
    # --- tf-idf ---
    train_x, train_y, test_x, test_y = tfidf_no_empty(train_x_raw, train_y_raw, test_x_raw, test_y_raw)
    print("logistic regression, tfidf")
    log_reg_tfidf = _get_multiclass_logistic_regression_model_tfidf(train_x, train_y)
    eval_model(log_reg_tfidf, test_x, test_y)
    evaluate_model(log_reg_tfidf, test_x, test_y, plot_roc=False)
    print("random forest, tfidf")
    rand_for_tfidf = _get_random_forest_model_tfidf(train_x, train_y)
    eval_model(rand_for_tfidf, test_x, test_y)
    evaluate_model(rand_for_tfidf, test_x, test_y, plot_roc=False)
    print("naive bayes, tfidf")
    nb_tfidf = _get_naive_bayes_model_tfidf(train_x.A, train_y)
    eval_model(nb_tfidf, test_x.A, test_y)
    # Fixed: previously passed the sparse matrix here while eval_model got the
    # dense .A view; naive bayes is trained on dense input, so use .A for both.
    evaluate_model(nb_tfidf, test_x.A, test_y, plot_roc=False)
    # --- doc2vec ---
    train_x, train_y, test_x, test_y = doc2vec_simple(train_x_raw, train_y_raw, test_x_raw, test_y_raw)
    print("logistic regression, doc2vec")
    log_reg_d2v = _get_multiclass_logistic_regression_model_doc2vec_simple(train_x, train_y)
    eval_model(log_reg_d2v, test_x, test_y)
    evaluate_model(log_reg_d2v, test_x, test_y, plot_roc=False)
    print("random forest, doc2vec")
    rand_for_d2v = _get_random_forest_model_doc2vec_simple(train_x, train_y)
    eval_model(rand_for_d2v, test_x, test_y)
    evaluate_model(rand_for_d2v, test_x, test_y, plot_roc=False)
    print("naive bayes, doc2vec")
    nb_d2v = _get_naive_bayes_model_doc2vec_simple(train_x, train_y)
    eval_model(nb_d2v, test_x, test_y)
    evaluate_model(nb_d2v, test_x, test_y, plot_roc=False)
def per_site_accuracy_increase():
    """Generalizability check: how does accuracy on unseen hospitals change
    as more source hospitals are included in the training set?

    For each of 30 runs: shuffle the per-hospital dataframes, then for every
    split point k train the NN on hospitals 1..k and test on the rest.
    Each run writes "n_train_sites, n_test_sites, accuracy, n_train_rows"
    lines to a randomly named 7-character file in output_dir.
    """
    from random import shuffle
    from Models.neural_net import MultiClassNNScratch
    # Load data and build the shared vocabulary over the whole corpus.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    all_tokens, _ = tokenize(df, df, save_missing_feature_as_string=False, remove_empty=True)
    _, _, vocab = tokens_to_bagofwords(all_tokens, all_tokens)
    # Split data per source hospital into separate dataframes.
    site_frames = [df[df['src_file'] == src] for src in df['src_file'].unique()]
    # Save untrained weights once so the network can be reset cheaply
    # before every split instead of rebuilding the model.
    model = MultiClassNNScratch((0, len(vocab)), np.array(data_reader.get_region_labels()['Code']),
                                epochs=100, batch_size=256)
    model.model.save_weights('empty_model.h5')
    # Repeat the evaluation 30 times, each run with its own output file.
    for _ in range(30):
        shuffle(site_frames)
        # 'with' guarantees the output file is closed even if training fails
        # (the original left the handle open on exceptions).
        with open("output_dir/" + randomword(7) + '.txt', "w") as out_file:
            # Train on sites [0, k), test on sites [k, end).
            for k in range(1, len(site_frames)):
                model.model.load_weights('empty_model.h5')  # reset network
                train_set = site_frames[:k]
                test_set = site_frames[k:]
                test_x_raw, test_y_raw = get_x_y_split(pd.concat(test_set))
                test_tokens, test_y_raw = tokenize(test_x_raw, test_y_raw,
                                                   save_missing_feature_as_string=False, remove_empty=True)
                test_x, test_y, _ = tokens_to_bagofwords(test_tokens, test_y_raw, feature_names=vocab)
                train_frame = pd.concat(train_set)
                train_x_raw, train_y_raw, val_x_raw, val_y_raw = get_train_test_split(train_frame)
                train_tokens, train_y_raw = tokenize(train_x_raw, train_y_raw,
                                                     save_missing_feature_as_string=False, remove_empty=True)
                train_x, train_y, _ = tokens_to_bagofwords(train_tokens, train_y_raw, feature_names=vocab)
                val_tokens, val_y_raw = tokenize(val_x_raw, val_y_raw,
                                                 save_missing_feature_as_string=False, remove_empty=True)
                val_x, val_y, _ = tokens_to_bagofwords(val_tokens, val_y_raw, feature_names=vocab)
                model.set_train_data(train_x, train_y)
                model.train(val_x, val_y)
                accuracy = evaluate_model_nn(model, test_x, test_y, plot_roc=False)
                # Fixed: added the trailing newline so each split is one row
                # (the original concatenated every record onto a single line).
                out_file.write("%d, %d, %4.2f, %d\n" % (len(train_set), len(test_set), accuracy, len(train_frame)))
def top_keywords_kmeans():
    """Cluster the reports with k-means on a tf-idf representation and print
    the top 10 keywords of each cluster.

    ON WG IDENTIFIER classes with fewer than 5 training samples are dropped
    before clustering.
    """
    from collections import Counter
    # Load data.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)
    train_x_raw.drop(['RIS PROCEDURE CODE'], axis=1, inplace=True)
    test_x_raw.drop(['RIS PROCEDURE CODE'], axis=1, inplace=True)
    # Identify ON WG IDENTIFIERs that occur fewer than min_samples times.
    # Counter gives O(n) counting instead of list.count per unique id (O(n^2)).
    min_samples = 5
    id_counts = Counter(train_y_raw['ON WG IDENTIFIER'].values.tolist())
    small_clusters = [ident for ident, count in id_counts.items() if count < min_samples]
    # Drop the under-represented identifiers from the training split.
    train_x_raw = train_x_raw[~train_y_raw['ON WG IDENTIFIER'].isin(small_clusters)]
    train_y_raw = train_y_raw[~train_y_raw['ON WG IDENTIFIER'].isin(small_clusters)]
    num_clusters = len(id_counts) - len(small_clusters)
    # Append the ON WG IDENTIFIERs to the original documents.
    train_y_raw = pd.concat([train_x_raw, train_y_raw], axis=1)
    test_y_raw = pd.concat([test_x_raw, test_y_raw], axis=1)
    # Tokenize and subsample.
    tokens_train, train_y_raw = tokenize_columns(train_x_raw, train_y_raw, regex_string=r'[a-zA-Z0-9]+',
                                                 save_missing_feature_as_string=False, remove_short=True,
                                                 remove_num=True, remove_empty=True)
    tokens_test, test_y_raw = tokenize_columns(test_x_raw, test_y_raw, regex_string=r'[a-zA-Z0-9]+',
                                               save_missing_feature_as_string=False, remove_short=True,
                                               remove_num=True, remove_empty=True)
    # Get the tf-idf representation of the data (dense arrays for k-means).
    train_x, train_y, feature_names = tokens_to_bagofwords(tokens_train, train_y_raw, TfidfVectorizer)
    test_x, test_y, _ = tokens_to_bagofwords(tokens_test, test_y_raw, TfidfVectorizer, feature_names=feature_names)
    train_x = train_x.toarray()
    test_x = test_x.toarray()
    # Run k-means.
    kmeans = Kmeans(num_clusters, feature_names, train_x, train_y, "tfidf")
    kmeans.eval()
    labels = kmeans.get_labels()
    # Group rows by cluster and take the mean tf-idf weight of each term,
    # then print the n_terms highest-weighted terms per cluster.
    n_terms = 10
    cluster_means = pd.DataFrame(train_x).groupby(labels).mean()
    for cluster_id, row in cluster_means.iterrows():
        top_terms = [str(feature_names[t]) for t in np.argsort(row)[-n_terms:]]
        print('Cluster {}: '.format(cluster_id) + ','.join(top_terms))
# Script entry point: run the currently enabled evaluation in main().
if __name__ == '__main__':
    main()