@@ -1,4 +1,3 @@
-
 from keras.layers.core import Dense, Activation, Dropout
 from keras.layers.recurrent import LSTM
 from keras.layers import Bidirectional
@@ -13,6 +12,6 @@
 import numpy as np
 import random
 from random import randint
-random.seed( 3 )
+random.seed(3)
 import datetime, re, operator
 from random import shuffle
@@ -23,6 +22,7 @@
 from os.path import isfile, join, isdir
 import pickle
 
+#import data augmentation methods
 from nlp_aug import *
 
 ###################################################
@@ -70,7 +70,7 @@ def get_all_txt_paths(master_folder):
 ################ data processing ##################
 ###################################################
 
-#get the pickle file for the vocab so you don't have to load the entire dictionary
+#get the pickle file for the word2vec so you don't have to load the entire huge file each time
 def gen_vocab_dicts(folder, output_pickle_path, huge_word2vec):
 
     vocab = set()
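
Once `gen_vocab_dicts` has written the pickle, later runs can load the small vocabulary-restricted dictionary instead of re-parsing the huge word2vec text file, which is what the revised comment is getting at. A minimal sketch of that load step (the pickle path here is illustrative, not from the diff):

import pickle

# load the word -> vector dictionary that gen_vocab_dicts pickled earlier
word2vec = pickle.load(open('word2vec_vocab.p', 'rb'))  # hypothetical path
vector = word2vec.get('movie')  # a word2vec_len-dimensional vector if 'movie' is in the vocab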
@@ -108,22 +108,8 @@ def gen_vocab_dicts(folder, output_pickle_path, huge_word2vec):
     pickle.dump(word2vec, open(output_pickle_path, 'wb'))
     print("dictionaries outputted to", output_pickle_path)
 
-#generate more data with standard augmentation
-def gen_standard_aug(train_orig, output_file):
-    writer = open(output_file, 'w')
-    lines = open(train_orig, 'r').readlines()
-    for i, line in enumerate(lines):
-        parts = line[:-1].split('\t')
-        label = parts[0]
-        sentence = parts[1]
-        aug_sentences = standard_augmentation(sentence)
-        for aug_sentence in aug_sentences:
-            writer.write(label + "\t" + aug_sentence + '\n')
-    writer.close()
-
-
 #getting the x and y inputs in numpy array form from the text file
-def get_x_y(train_txt, word2vec_len, input_size, word2vec, percent_dataset):
+def get_x_y(train_txt, num_classes, word2vec_len, input_size, word2vec, percent_dataset):
 
     #read in lines
     train_lines = open(train_txt, 'r').readlines()
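
The extra `num_classes` parameter added to `get_x_y` threads the class count through to the label matrix built below. A hypothetical call, with the dimensions (300-dimensional vectors, 25-word inputs) chosen only for illustration:

train_x, train_y = get_x_y('data/train.txt', num_classes=2, word2vec_len=300,
                           input_size=25, word2vec=word2vec, percent_dataset=1.0)
# train_x has shape (num_lines, 25, 300); train_y has shape (num_lines, 2)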
@@ -133,7 +119,7 @@ def get_x_y(train_txt, word2vec_len, input_size, word2vec, percent_dataset):
 
     #initialize x and y matrix
     x_matrix = np.zeros((num_lines, input_size, word2vec_len))
-    y_matrix = np.zeros((num_lines))
+    y_matrix = np.zeros((num_lines, num_classes))
 
     #insert values
     for i, line in enumerate(train_lines):
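
Growing `y_matrix` from `(num_lines,)` to `(num_lines, num_classes)` switches the labels from scalar class ids to one-hot rows, which is what the `categorical_crossentropy` loss introduced further down expects. A small self-contained illustration of the encoding that `y_matrix[i][label] = 1.0` performs:

import numpy as np

labels = [0, 2, 1]                   # scalar class ids for three example lines
y_matrix = np.zeros((3, 3))          # (num_lines, num_classes)
for i, label in enumerate(labels):
    y_matrix[i][label] = 1.0         # row i is all zeros except column `label`
# y_matrix is now [[1,0,0], [0,0,1], [0,1,0]]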
@@ -150,50 +136,49 @@ def get_x_y(train_txt, word2vec_len, input_size, word2vec, percent_dataset):
             x_matrix[i, j, :] = word2vec[word]
 
         #insert y
-        y_matrix[i] = label
+        y_matrix[i][label] = 1.0
 
     return x_matrix, y_matrix
 
+###################################################
+############### data augmentation #################
+###################################################
+
+#generate more data with standard augmentation
+def gen_standard_aug(train_orig, output_file):
+    writer = open(output_file, 'w')
+    lines = open(train_orig, 'r').readlines()
+    for i, line in enumerate(lines):
+        parts = line[:-1].split('\t')
+        label = parts[0]
+        sentence = parts[1]
+        aug_sentences = standard_augmentation(sentence)
+        for aug_sentence in aug_sentences:
+            writer.write(label + "\t" + aug_sentence + '\n')
+    writer.close()
 
 ###################################################
 ##################### model #######################
 ###################################################
 
 #building the model in keras
-def build_model(sentence_length, word2vec_len):
+def build_model(sentence_length, word2vec_len, num_classes):
     model = None
     model = Sequential()
-    model.add(Bidirectional(LSTM(50, return_sequences=True), input_shape=(sentence_length, word2vec_len)))
+    model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(sentence_length, word2vec_len)))
     model.add(Dropout(0.5))
-    model.add(Bidirectional(LSTM(50, return_sequences=False)))
+    model.add(Bidirectional(LSTM(128, return_sequences=False)))
     model.add(Dropout(0.5))
     model.add(Dense(20, activation='relu'))
-    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
-    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
+    model.add(Dense(num_classes, kernel_initializer='normal', activation='softmax'))
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
     #print(model.summary())
     return model
 
-#confidences to binary
-def conf_to_pred(y):
-
-    if type(y) == list:
-        y_class = []
-        for pred in y:
-            if pred < 0.5:
-                y_class.append(0)
-            else:
-                y_class.append(1)
-        return y_class
-
-    else:
-        y_class = np.zeros(y.shape)
-        for i in range(y.shape[0]):
-            if y[i] < 0.5:
-                y_class[i] = 0
-            else:
-                y_class[i] = 1
-        return y_class
-
+#one hot to categorical
+def one_hot_to_categorical(y):
+    assert len(y.shape) == 2
+    return np.argmax(y, axis=1)
 
 
 
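The relocated `gen_standard_aug` expects one `label<TAB>sentence` example per line and delegates the actual rewriting to `standard_augmentation` from `nlp_aug`. A hypothetical input file and call (file names invented for illustration):

# train.txt, tab-separated, one example per line, e.g.:
#   1\tthis movie was great
#   0\tterrible plot and acting

gen_standard_aug('train.txt', 'train_aug.txt')
# train_aug.txt gets one line per augmented sentence, keeping each original label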
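Putting the revised pieces together: `build_model` now ends in a `num_classes`-way softmax trained against the one-hot `y_matrix`, and `one_hot_to_categorical` recovers scalar class ids from it for scoring, with an `argmax` over predicted probabilities playing the same role on the model side. A sketch under those assumptions (epoch count and batch size are arbitrary, and `test_x`/`test_y` would come from another `get_x_y` call):

model = build_model(sentence_length=25, word2vec_len=300, num_classes=2)
model.fit(train_x, train_y, epochs=10, batch_size=32)

y_prob = model.predict(test_x)            # (num_lines, num_classes) probabilities
pred_classes = np.argmax(y_prob, axis=1)  # same mapping one_hot_to_categorical applies to labels
true_classes = one_hot_to_categorical(test_y)
accuracy = np.mean(pred_classes == true_classes)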