
Commit f3a01a2 ("first commit")
1 parent 3fac69b

13 files changed: +636 -0 lines

.gitignore (+3 lines)

datasets*
word2vec*

code/1_data_process.py (+14 lines)

from methods import *
from config import *

if __name__ == "__main__":

    # generate the augmented data sets
    print("augmenting data")
    gen_standard_aug(train_orig, train_aug_st)
    print("done")

    # generate the vocab dictionary
    print("generating word 2 vec")
    gen_vocab_dicts(dataset_folder, word2vec_pickle, huge_word2vec)
    print("done")

code/2_train_eval.py (+44 lines)

from config import *
from methods import *
from numpy.random import seed
seed(0)

def run_model(train_file, test_file, percent_dataset, epochs_base):

    # initialize model
    model = build_model(input_size, word2vec_len)

    # load data
    word2vec = load_pickle(word2vec_pickle)
    train_x, train_y = get_x_y(train_file, word2vec_len, input_size, word2vec, percent_dataset)
    print("loaded data with shape:", train_x.shape, train_y.shape)
    test_x, test_y = get_x_y(test_file, word2vec_len, input_size, word2vec, 1)

    # train model
    n_epochs = min(500, int(epochs_base/percent_dataset))
    model.fit(train_x, train_y, batch_size=1024, epochs=n_epochs, validation_split=0.1, shuffle=True, verbose=0)
    #model = load_model('lol')
    #model.save('lol')

    # evaluate model
    y_pred = model.predict(test_x)
    y_pred_binary = conf_to_pred(y_pred)
    acc = accuracy_score(test_y, y_pred_binary)

    # return the accuracy
    print(train_file, test_file, percent_dataset, acc)
    return acc

if __name__ == "__main__":

    # get the accuracy at each increment
    orig_accs = []
    aug_accs = []

    for increment in increments:
        orig_acc = run_model(train_orig, test_path, increment, epochs_base=5)
        orig_accs.append(orig_acc)
        aug_acc = run_model(train_aug_st, test_path, increment, epochs_base=2)
        aug_accs.append(aug_acc)

    print(orig_accs, aug_accs)
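Since model.fit runs with verbose=0, it is easy to misjudge how aggressively the n_epochs formula above scales at small dataset fractions. The snippet below (illustrative only, not part of the repo) simply evaluates that formula for the two epochs_base values used in __main__:

# illustrative: n_epochs = min(500, int(epochs_base / percent_dataset)) for each increment
increments = [0.001, 0.003, 0.01, 0.05, 0.1, 0.25, 0.5, 1]
for epochs_base in (5, 2):
    print(epochs_base, [min(500, int(epochs_base / p)) for p in increments])
# 5 -> [500, 500, 500, 100, 50, 20, 10, 5]
# 2 -> [500, 500, 200, 40, 20, 8, 4, 2]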
Three binary files (514 Bytes, 4.8 KB, 5.29 KB) not shown.

code/config.py (+13 lines)

# user inputs
dataset_folder = 'datasets/subj'
input_size = 50 # max number of words per sentence
huge_word2vec = 'word2vec/glove.840B.300d.txt'
word2vec_len = 300
word2vec_pickle = dataset_folder + '/word2vec.p' # avoid loading the huge word2vec file every time: save only the words actually used into a smaller dictionary
increments = [0.001, 0.003, 0.01, 0.05, 0.1, 0.25, 0.5, 1]

train_orig = dataset_folder + '/train_orig.txt'
train_aug_st = dataset_folder + '/train_aug_st.txt'
test_path = dataset_folder + '/test.txt'
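A minimal sanity check before running the pipeline, assuming the GloVe file and the subj dataset have already been downloaded to the paths above (this snippet is not part of the repo):

# illustrative: confirm that everything config.py points to actually exists
import os
from config import dataset_folder, huge_word2vec, train_orig, test_path
for path in [dataset_folder, huge_word2vec, train_orig, test_path]:
    print(path, "found" if os.path.exists(path) else "MISSING")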

code/methods.py (+200 lines)

from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers import Bidirectional
from keras.models import Sequential
from keras.models import load_model

from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

import math
import time
import numpy as np
import random
from random import randint
random.seed(3)
import datetime, re, operator
from random import shuffle # note: shadows sklearn.utils.shuffle imported above

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # get rid of TensorFlow warnings
from os import listdir
from os.path import isfile, join, isdir
import pickle

from nlp_aug import *
###################################################
######### loading folders and txt files ###########
###################################################

# load a pickle file
def load_pickle(file):
    return pickle.load(open(file, 'rb'))

# create an output folder if it does not already exist
def confirm_output_folder(output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

# get full paths of the txt files in a folder
def get_txt_paths(folder):
    txt_paths = [join(folder, f) for f in listdir(folder) if isfile(join(folder, f)) and '.txt' in f]
    if join(folder, '.DS_Store') in txt_paths:
        txt_paths.remove(join(folder, '.DS_Store'))
    txt_paths = sorted(txt_paths)
    return txt_paths

# get subfolder paths
def get_subfolder_paths(folder):
    subfolder_paths = [join(folder, f) for f in listdir(folder) if (isdir(join(folder, f)) and '.DS_Store' not in f)]
    if join(folder, '.DS_Store') in subfolder_paths:
        subfolder_paths.remove(join(folder, '.DS_Store'))
    subfolder_paths = sorted(subfolder_paths)
    return subfolder_paths

# get all txt paths, looking one level down into subfolders if there are any
def get_all_txt_paths(master_folder):

    all_paths = []
    subfolders = get_subfolder_paths(master_folder)
    if len(subfolders) > 1:
        for subfolder in subfolders:
            all_paths += get_txt_paths(subfolder)
    else:
        all_paths = get_txt_paths(master_folder)
    return all_paths

###################################################
################ data processing ##################
###################################################

# build the word2vec pickle for the vocab so you don't have to load the entire embedding file every time
def gen_vocab_dicts(folder, output_pickle_path, huge_word2vec):

    vocab = set()
    text_embeddings = open(huge_word2vec, 'r').readlines()
    word2vec = {}

    # get all the vocab
    all_txt_paths = get_all_txt_paths(folder)
    print(all_txt_paths)

    # loop through each text file
    for txt_path in all_txt_paths:

        # get all the words
        try:
            all_lines = open(txt_path, "r").readlines()
            for line in all_lines:
                words = line[:-1].split(' ')
                for word in words:
                    vocab.add(word)
        except:
            print(txt_path, "has an error")

    print(len(vocab), "unique words found")

    # load the word embeddings, and only add a word to the dictionary if we need it
    for line in text_embeddings:
        items = line.split(' ')
        word = items[0]
        if word in vocab:
            vec = items[1:]
            word2vec[word] = np.asarray(vec, dtype='float32')
    print(len(word2vec), "matches between unique words and word2vec dictionary")

    pickle.dump(word2vec, open(output_pickle_path, 'wb'))
    print("dictionaries outputted to", output_pickle_path)

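Once 1_data_process.py has run, the reduced dictionary can be checked with a few lines like these (illustrative, not part of the repo; "film" is just an arbitrary example key):

# illustrative: load the reduced word-vector dictionary written by gen_vocab_dicts
from methods import load_pickle
from config import word2vec_pickle
word2vec = load_pickle(word2vec_pickle)
print(len(word2vec), "words kept")
vec = word2vec.get("film")   # any in-vocab word works; "film" is only an example
if vec is not None:
    print(vec.shape)         # expected (300,), matching word2vec_len in config.py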
# generate more data with standard augmentation
def gen_standard_aug(train_orig, output_file):
    writer = open(output_file, 'w')
    lines = open(train_orig, 'r').readlines()
    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = parts[0]
        sentence = parts[1]
        aug_sentences = standard_augmentation(sentence) # standard_augmentation comes from nlp_aug (imported with *)
        for aug_sentence in aug_sentences:
            writer.write(label + "\t" + aug_sentence + '\n')
    writer.close()

# get the x and y inputs in numpy array form from the text file
def get_x_y(train_txt, word2vec_len, input_size, word2vec, percent_dataset):

    # read in lines
    train_lines = open(train_txt, 'r').readlines()
    shuffle(train_lines)
    train_lines = train_lines[:int(percent_dataset*len(train_lines))]
    num_lines = len(train_lines)

    # initialize x and y matrices
    x_matrix = np.zeros((num_lines, input_size, word2vec_len))
    y_matrix = np.zeros((num_lines))

    # insert values
    for i, line in enumerate(train_lines):

        parts = line[:-1].split('\t')
        label = int(parts[0])
        sentence = parts[1]

        # insert x
        words = sentence.split(' ')
        words = words[:x_matrix.shape[1]] # cut off if too long
        for j, word in enumerate(words):
            if word in word2vec:
                x_matrix[i, j, :] = word2vec[word]

        # insert y
        y_matrix[i] = label

    return x_matrix, y_matrix

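For reference, with the values in config.py the arrays returned by get_x_y look like this (a small illustrative check, not part of the repo):

# illustrative: shapes produced by get_x_y with the config defaults
from methods import load_pickle, get_x_y
from config import word2vec_pickle, word2vec_len, input_size, test_path
word2vec = load_pickle(word2vec_pickle)
test_x, test_y = get_x_y(test_path, word2vec_len, input_size, word2vec, 1)
print(test_x.shape)  # (num_lines, 50, 300), i.e. (num_lines, input_size, word2vec_len)
print(test_y.shape)  # (num_lines,), one 0/1 label per line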
###################################################
##################### model #######################
###################################################

# building the model in keras
def build_model(sentence_length, word2vec_len):
    model = None
    model = Sequential()
    model.add(Bidirectional(LSTM(50, return_sequences=True), input_shape=(sentence_length, word2vec_len)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(50, return_sequences=False)))
    model.add(Dropout(0.5))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    #print(model.summary())
    return model

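To inspect the layer stack without training anything, the model can be built on its own (illustrative, assuming the Keras version this code targets is installed):

# illustrative: build the network and print its layer summary
from methods import build_model
from config import input_size, word2vec_len
model = build_model(input_size, word2vec_len)
model.summary()  # Bidirectional LSTM x2 -> Dense(20, relu) -> Dense(1, sigmoid)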
# confidences to binary (threshold at 0.5)
def conf_to_pred(y):

    if type(y) == list:
        y_class = []
        for pred in y:
            if pred < 0.5:
                y_class.append(0)
            else:
                y_class.append(1)
        return y_class

    else:
        y_class = np.zeros(y.shape)
        for i in range(y.shape[0]):
            if y[i] < 0.5:
                y_class[i] = 0
            else:
                y_class[i] = 1
        return y_class
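conf_to_pred handles both plain lists and numpy arrays; a quick illustrative check of the thresholding (not part of the repo):

# illustrative: confidences below 0.5 map to 0, everything else to 1
import numpy as np
from methods import conf_to_pred
print(conf_to_pred([0.1, 0.7, 0.5]))            # [0, 1, 1]
print(conf_to_pred(np.array([0.1, 0.7, 0.5])))  # [0. 1. 1.]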
