from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers import Bidirectional
from keras.models import Sequential
from keras.models import load_model

from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

import math
import time
import numpy as np
import random
from random import randint
random.seed(3)
import datetime, re, operator
from random import shuffle #note: this shadows sklearn.utils.shuffle; the in-place random.shuffle is the one used below

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #get rid of tensorflow warnings
from os import listdir
from os.path import isfile, join, isdir
import pickle

from nlp_aug import *

###################################################
######### loading folders and txt files ###########
###################################################

#load a pickle file
def load_pickle(file):
    return pickle.load(open(file, 'rb'))

#create an output folder if it does not already exist
def confirm_output_folder(output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

#get full paths of the txt files in a folder
def get_txt_paths(folder):
    txt_paths = [join(folder, f) for f in listdir(folder) if isfile(join(folder, f)) and '.txt' in f]
    if join(folder, '.DS_Store') in txt_paths:
        txt_paths.remove(join(folder, '.DS_Store'))
    txt_paths = sorted(txt_paths)
    return txt_paths

#get the paths of all subfolders
def get_subfolder_paths(folder):
    subfolder_paths = [join(folder, f) for f in listdir(folder) if (isdir(join(folder, f)) and '.DS_Store' not in f)]
    if join(folder, '.DS_Store') in subfolder_paths:
        subfolder_paths.remove(join(folder, '.DS_Store'))
    subfolder_paths = sorted(subfolder_paths)
    return subfolder_paths

#get all txt file paths under a master folder
def get_all_txt_paths(master_folder):

    all_paths = []
    subfolders = get_subfolder_paths(master_folder)
    if len(subfolders) > 1:
        for subfolder in subfolders:
            all_paths += get_txt_paths(subfolder)
    else:
        all_paths = get_txt_paths(master_folder)
    return all_paths

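#example usage (a minimal sketch; 'data/sst2' is a placeholder dataset folder, not part of the original script):
#    txt_paths = get_all_txt_paths('data/sst2')
#if the master folder has more than one subfolder, txt files are gathered from each subfolder; otherwise they are read from the master folder itself.
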
###################################################
################ data processing ##################
###################################################

#generate the pickle file for the vocab so you don't have to load the entire word2vec dictionary
def gen_vocab_dicts(folder, output_pickle_path, huge_word2vec):

    vocab = set()
    text_embeddings = open(huge_word2vec, 'r').readlines()
    word2vec = {}

    #get all the vocab
    all_txt_paths = get_all_txt_paths(folder)
    print(all_txt_paths)

    #loop through each text file
    for txt_path in all_txt_paths:

        #get all the words
        try:
            all_lines = open(txt_path, "r").readlines()
            for line in all_lines:
                words = line[:-1].split(' ')
                for word in words:
                    vocab.add(word)
        except Exception:
            print(txt_path, "has an error")

    print(len(vocab), "unique words found")

    #load the word embeddings, and only add a word to the dictionary if it appears in the vocab
    for line in text_embeddings:
        items = line.split(' ')
        word = items[0]
        if word in vocab:
            vec = items[1:]
            word2vec[word] = np.asarray(vec, dtype='float32')
    print(len(word2vec), "matches between unique words and word2vec dictionary")

    pickle.dump(word2vec, open(output_pickle_path, 'wb'))
    print("dictionary output to", output_pickle_path)

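#example usage (a minimal sketch; the paths below are placeholders, and the embeddings text file is assumed to have one "word vec..." entry per line):
#    gen_vocab_dicts('data/sst2', 'word2vec/sst2_word2vec.p', 'word2vec/glove.840B.300d.txt')
#this collects the vocab from every txt file under data/sst2 and pickles only the embedding vectors that match it.
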
#generate more data with standard augmentation (standard_augmentation is imported from nlp_aug)
def gen_standard_aug(train_orig, output_file):
    writer = open(output_file, 'w')
    lines = open(train_orig, 'r').readlines()
    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = parts[0]
        sentence = parts[1]
        aug_sentences = standard_augmentation(sentence)
        for aug_sentence in aug_sentences:
            writer.write(label + "\t" + aug_sentence + '\n')
    writer.close()

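#example usage (sketch; paths are hypothetical — each input line is expected to be "label<TAB>sentence"):
#    gen_standard_aug('data/sst2/train_orig.txt', 'data/sst2/train_aug.txt')
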
#getting the x and y inputs in numpy array form from the text file
def get_x_y(train_txt, word2vec_len, input_size, word2vec, percent_dataset):

    #read in lines
    train_lines = open(train_txt, 'r').readlines()
    shuffle(train_lines)
    train_lines = train_lines[:int(percent_dataset*len(train_lines))]
    num_lines = len(train_lines)

    #initialize x and y matrix
    x_matrix = np.zeros((num_lines, input_size, word2vec_len))
    y_matrix = np.zeros((num_lines))

    #insert values
    for i, line in enumerate(train_lines):

        parts = line[:-1].split('\t')
        label = int(parts[0])
        sentence = parts[1]

        #insert x
        words = sentence.split(' ')
        words = words[:x_matrix.shape[1]] #cut off if too long
        for j, word in enumerate(words):
            if word in word2vec:
                x_matrix[i, j, :] = word2vec[word]

        #insert y
        y_matrix[i] = label

    return x_matrix, y_matrix

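#example usage (a minimal sketch; the pickle path, 300-dim vectors, and 25-word max length are assumptions for illustration):
#    word2vec = load_pickle('word2vec/sst2_word2vec.p')
#    x_train, y_train = get_x_y('data/sst2/train_orig.txt', 300, 25, word2vec, 1.0)
#x_train has shape (num_lines, 25, 300); sentences are truncated at 25 words and out-of-vocab words stay as zero vectors.
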
###################################################
##################### model #######################
###################################################

#build the bidirectional LSTM model in keras
def build_model(sentence_length, word2vec_len):
    model = Sequential()
    model.add(Bidirectional(LSTM(50, return_sequences=True), input_shape=(sentence_length, word2vec_len)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(50, return_sequences=False)))
    model.add(Dropout(0.5))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    #print(model.summary())
    return model

#confidences to binary
def conf_to_pred(y):

    if type(y) == list:
        y_class = []
        for pred in y:
            if pred < 0.5:
                y_class.append(0)
            else:
                y_class.append(1)
        return y_class

    else:
        y_class = np.zeros(y.shape)
        for i in range(y.shape[0]):
            if y[i] < 0.5:
                y_class[i] = 0
            else:
                y_class[i] = 1
        return y_class

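#end-to-end example (a minimal sketch under assumed settings: 300-dim vectors, 25-word inputs, placeholder file paths):
#    word2vec = load_pickle('word2vec/sst2_word2vec.p')
#    x_train, y_train = get_x_y('data/sst2/train_aug.txt', 300, 25, word2vec, 1.0)
#    x_test, y_test = get_x_y('data/sst2/test.txt', 300, 25, word2vec, 1.0)
#    model = build_model(25, 300)
#    model.fit(x_train, y_train, epochs=5, batch_size=64, validation_split=0.1)
#    y_pred = conf_to_pred(model.predict(x_test).flatten())
#    print(accuracy_score(y_test, y_pred))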