from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers import Bidirectional
from keras.models import Sequential
from keras.models import load_model

from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

import math
import time
import numpy as np
import random
from random import randint
random.seed(3)
import datetime, re, operator
from random import shuffle #note: this shadows sklearn.utils.shuffle; the in-place random.shuffle is the one used below

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #get rid of tensorflow warnings
from os import listdir
from os.path import isfile, join, isdir
import pickle

from nlp_aug import *

###################################################
######### loading folders and txt files ###########
###################################################

#load a pickle file
def load_pickle(file):
    return pickle.load(open(file, 'rb'))

#create an output folder if it does not already exist
def confirm_output_folder(output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

#get full paths of the txt files in a folder
def get_txt_paths(folder):
    txt_paths = [join(folder, f) for f in listdir(folder) if isfile(join(folder, f)) and '.txt' in f]
    if join(folder, '.DS_Store') in txt_paths:
        txt_paths.remove(join(folder, '.DS_Store'))
    txt_paths = sorted(txt_paths)
    return txt_paths

#get the paths of all subfolders
def get_subfolder_paths(folder):
    subfolder_paths = [join(folder, f) for f in listdir(folder) if (isdir(join(folder, f)) and '.DS_Store' not in f)]
    if join(folder, '.DS_Store') in subfolder_paths:
        subfolder_paths.remove(join(folder, '.DS_Store'))
    subfolder_paths = sorted(subfolder_paths)
    return subfolder_paths

#get all txt file paths under a master folder
def get_all_txt_paths(master_folder):

    all_paths = []
    subfolders = get_subfolder_paths(master_folder)
    if len(subfolders) > 1:
        for subfolder in subfolders:
            all_paths += get_txt_paths(subfolder)
    else:
        all_paths = get_txt_paths(master_folder)
    return all_paths

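#example usage (a minimal sketch; 'data/sst2' is a placeholder dataset folder, not part of the original script):
#    txt_paths = get_all_txt_paths('data/sst2')
#if the master folder has more than one subfolder, txt files are gathered from each subfolder; otherwise they are read from the master folder itself.
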
###################################################
################ data processing ##################
###################################################

#generate the pickle file for the vocab so you don't have to load the entire word2vec dictionary
def gen_vocab_dicts(folder, output_pickle_path, huge_word2vec):

    vocab = set()
    text_embeddings = open(huge_word2vec, 'r').readlines()
    word2vec = {}

    #get all the vocab
    all_txt_paths = get_all_txt_paths(folder)
    print(all_txt_paths)

    #loop through each text file
    for txt_path in all_txt_paths:

        #get all the words
        try:
            all_lines = open(txt_path, "r").readlines()
            for line in all_lines:
                words = line[:-1].split(' ')
                for word in words:
                    vocab.add(word)
        except Exception:
            print(txt_path, "has an error")

    print(len(vocab), "unique words found")

    #load the word embeddings, and only add a word to the dictionary if it appears in the vocab
    for line in text_embeddings:
        items = line.split(' ')
        word = items[0]
        if word in vocab:
            vec = items[1:]
            word2vec[word] = np.asarray(vec, dtype='float32')
    print(len(word2vec), "matches between unique words and word2vec dictionary")

    pickle.dump(word2vec, open(output_pickle_path, 'wb'))
    print("dictionary output to", output_pickle_path)

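#example usage (a minimal sketch; the paths below are placeholders, and the embeddings text file is assumed to have one "word vec..." entry per line):
#    gen_vocab_dicts('data/sst2', 'word2vec/sst2_word2vec.p', 'word2vec/glove.840B.300d.txt')
#this collects the vocab from every txt file under data/sst2 and pickles only the embedding vectors that match it.
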
#generate more data with standard augmentation (standard_augmentation is imported from nlp_aug)
def gen_standard_aug(train_orig, output_file):
    writer = open(output_file, 'w')
    lines = open(train_orig, 'r').readlines()
    for i, line in enumerate(lines):
        parts = line[:-1].split('\t')
        label = parts[0]
        sentence = parts[1]
        aug_sentences = standard_augmentation(sentence)
        for aug_sentence in aug_sentences:
            writer.write(label + "\t" + aug_sentence + '\n')
    writer.close()

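#example usage (sketch; paths are hypothetical — each input line is expected to be "label<TAB>sentence"):
#    gen_standard_aug('data/sst2/train_orig.txt', 'data/sst2/train_aug.txt')
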
#getting the x and y inputs in numpy array form from the text file
def get_x_y(train_txt, word2vec_len, input_size, word2vec, percent_dataset):

    #read in lines
    train_lines = open(train_txt, 'r').readlines()
    shuffle(train_lines)
    train_lines = train_lines[:int(percent_dataset*len(train_lines))]
    num_lines = len(train_lines)

    #initialize x and y matrix
    x_matrix = np.zeros((num_lines, input_size, word2vec_len))
    y_matrix = np.zeros((num_lines))

    #insert values
    for i, line in enumerate(train_lines):

        parts = line[:-1].split('\t')
        label = int(parts[0])
        sentence = parts[1]

        #insert x
        words = sentence.split(' ')
        words = words[:x_matrix.shape[1]] #cut off if too long
        for j, word in enumerate(words):
            if word in word2vec:
                x_matrix[i, j, :] = word2vec[word]

        #insert y
        y_matrix[i] = label

    return x_matrix, y_matrix

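#example usage (a minimal sketch; the pickle path, 300-dim vectors, and 25-word max length are assumptions for illustration):
#    word2vec = load_pickle('word2vec/sst2_word2vec.p')
#    x_train, y_train = get_x_y('data/sst2/train_orig.txt', 300, 25, word2vec, 1.0)
#x_train has shape (num_lines, 25, 300); sentences are truncated at 25 words and out-of-vocab words stay as zero vectors.
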
###################################################
##################### model #######################
###################################################

#build the bidirectional LSTM model in keras
def build_model(sentence_length, word2vec_len):
    model = Sequential()
    model.add(Bidirectional(LSTM(50, return_sequences=True), input_shape=(sentence_length, word2vec_len)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(50, return_sequences=False)))
    model.add(Dropout(0.5))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    #print(model.summary())
    return model

#confidences to binary
def conf_to_pred(y):

    if type(y) == list:
        y_class = []
        for pred in y:
            if pred < 0.5:
                y_class.append(0)
            else:
                y_class.append(1)
        return y_class

    else:
        y_class = np.zeros(y.shape)
        for i in range(y.shape[0]):
            if y[i] < 0.5:
                y_class[i] = 0
            else:
                y_class[i] = 1
        return y_class

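#end-to-end example (a minimal sketch under assumed settings: 300-dim vectors, 25-word inputs, placeholder file paths):
#    word2vec = load_pickle('word2vec/sst2_word2vec.p')
#    x_train, y_train = get_x_y('data/sst2/train_aug.txt', 300, 25, word2vec, 1.0)
#    x_test, y_test = get_x_y('data/sst2/test.txt', 300, 25, word2vec, 1.0)
#    model = build_model(25, 300)
#    model.fit(x_train, y_train, epochs=5, batch_size=64, validation_split=0.1)
#    y_pred = conf_to_pred(model.predict(x_test).flatten())
#    print(accuracy_score(y_test, y_pred))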