Commit a956ef3

add trec dataset

1 parent 8505ac7, commit a956ef3
13 files changed: +97 −63 lines

.gitignore (+3 −1)

@@ -1,3 +1,5 @@
 datasets*
 word2vec*
-
+raw*
+checkpoints*
+paper*

code/2_train_eval.py (+16 −11)

@@ -3,30 +3,31 @@
 from numpy.random import seed
 seed(0)
 
-def run_model(train_file, test_file, percent_dataset, epochs_base):
+def run_model(train_file, test_file, num_classes, percent_dataset, epochs_base):
 
     #initialize model
-    model = build_model(input_size, word2vec_len)
+    model = build_model(input_size, word2vec_len, num_classes)
 
     #load data
     word2vec = load_pickle(word2vec_pickle)
-    train_x, train_y = get_x_y(train_file, word2vec_len, input_size, word2vec, percent_dataset)
+    train_x, train_y = get_x_y(train_file, num_classes, word2vec_len, input_size, word2vec, percent_dataset)
     print("loaded data with shape:", train_x.shape, train_y.shape)
-    test_x, test_y = get_x_y(test_file, word2vec_len, input_size, word2vec, 1)
+    test_x, test_y = get_x_y(test_file, num_classes, word2vec_len, input_size, word2vec, 1)
 
     #train model
     n_epochs = min(500, int(epochs_base/percent_dataset))
     model.fit(train_x, train_y, batch_size=1024, epochs=n_epochs, validation_split=0.1, shuffle=True, verbose=0)
-    #model = load_model('lol')
-    #model.save('lol')
+    #model.save('checkpoints/lol')
+    #model = load_model('checkpoints/lol')
 
     #evaluate model
     y_pred = model.predict(test_x)
-    y_pred_binary = conf_to_pred(y_pred)
-    acc = accuracy_score(test_y, y_pred_binary)
+    test_y_cat = one_hot_to_categorical(test_y)
+    y_pred_cat = one_hot_to_categorical(y_pred)
+    acc = accuracy_score(test_y_cat, y_pred_cat)
 
     #return the accuracy
-    print(train_file, test_file, percent_dataset, acc)
+    print('train_file', train_file, 'test file', test_file, 'with fraction', percent_dataset, 'had accuracy', acc)
     return acc
 
 if __name__ == "__main__":

@@ -36,9 +37,13 @@ def run_model(train_file, test_file, percent_dataset, epochs_base):
     aug_accs = []
 
     for increment in increments:
-        orig_acc = run_model(train_orig, test_path, increment, epochs_base=5)
+        orig_acc = run_model(train_orig, test_path, num_classes, increment, epochs_base=20)
         orig_accs.append(orig_acc)
-        aug_acc = run_model(train_aug_st, test_path, increment, epochs_base=2)
+        aug_acc = run_model(train_aug_st, test_path, num_classes, increment, epochs_base=2)
        aug_accs.append(aug_acc)
 
+    #testing
+    #run_model(train_orig, test_path, num_classes, 1, epochs_base=2)
+
+
     print(orig_accs, aug_accs)
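
Note on the epoch schedule above: n_epochs = min(500, int(epochs_base/percent_dataset)) scales the number of epochs inversely with the fraction of training data used, capped at 500. For example, with epochs_base=20 and percent_dataset=0.01, int(20/0.01) = 2000, so training runs the capped 500 epochs; with the full dataset (percent_dataset=1), it runs 20 epochs.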
3 binary files changed (35 Bytes, −91 Bytes, 11 Bytes): binary files not shown.

code/config.py (+17 −5)

@@ -1,13 +1,25 @@
+#user inputs
 
+#dataset folder
+dataset_folder = 'datasets/trec'
 
-#user inputs
-dataset_folder = 'datasets/subj'
-input_size = 50 #number of words
+#number of output classes
+num_classes = 6
+
+#number of words for input
+input_size = 50
+
+#dataset increments
+increments = [0.001, 0.003, 0.01, 0.05, 0.1, 0.25, 0.5, 1]
+
+#word2vec dictionary
 huge_word2vec = 'word2vec/glove.840B.300d.txt'
 word2vec_len = 300
 word2vec_pickle = dataset_folder + '/word2vec.p' # don't want to load the huge pickle every time, so just save the words that are actually used into a smaller dictionary
-increments = [0.001, 0.003, 0.01, 0.05, 0.1, 0.25, 0.5, 1]
 
+#pre-existing file locations
 train_orig = dataset_folder + '/train_orig.txt'
-train_aug_st = dataset_folder + '/train_aug_st.txt'
 test_path = dataset_folder + '/test.txt'
+
+#files to be created
+train_aug_st = dataset_folder + '/train_aug_st.txt'

code/methods.py (+31 −46)

@@ -1,4 +1,3 @@
-
 from keras.layers.core import Dense, Activation, Dropout
 from keras.layers.recurrent import LSTM
 from keras.layers import Bidirectional

@@ -13,7 +12,7 @@
 import numpy as np
 import random
 from random import randint
-random.seed( 3 )
+random.seed(3)
 import datetime, re, operator
 from random import shuffle

@@ -23,6 +22,7 @@
 from os.path import isfile, join, isdir
 import pickle
 
+#import data augmentation methods
 from nlp_aug import *
 
 ###################################################

@@ -70,7 +70,7 @@ def get_all_txt_paths(master_folder):
 ################ data processing ##################
 ###################################################
 
-#get the pickle file for the vocab so you don't have to load the entire dictionary
+#get the pickle file for the word2vec so you don't have to load the entire huge file each time
 def gen_vocab_dicts(folder, output_pickle_path, huge_word2vec):
 
     vocab = set()

@@ -108,22 +108,8 @@ def gen_vocab_dicts(folder, output_pickle_path, huge_word2vec):
     pickle.dump(word2vec, open(output_pickle_path, 'wb'))
     print("dictionaries outputted to", output_pickle_path)
 
-#generate more data with standard augmentation
-def gen_standard_aug(train_orig, output_file):
-    writer = open(output_file, 'w')
-    lines = open(train_orig, 'r').readlines()
-    for i, line in enumerate(lines):
-        parts = line[:-1].split('\t')
-        label = parts[0]
-        sentence = parts[1]
-        aug_sentences = standard_augmentation(sentence)
-        for aug_sentence in aug_sentences:
-            writer.write(label + "\t" + aug_sentence + '\n')
-    writer.close()
-
-
 #getting the x and y inputs in numpy array form from the text file
-def get_x_y(train_txt, word2vec_len, input_size, word2vec, percent_dataset):
+def get_x_y(train_txt, num_classes, word2vec_len, input_size, word2vec, percent_dataset):
 
     #read in lines
     train_lines = open(train_txt, 'r').readlines()

@@ -133,7 +119,7 @@ def get_x_y(train_txt, word2vec_len, input_size, word2vec, percent_dataset):
 
     #initialize x and y matrix
     x_matrix = np.zeros((num_lines, input_size, word2vec_len))
-    y_matrix = np.zeros((num_lines))
+    y_matrix = np.zeros((num_lines, num_classes))
 
     #insert values
     for i, line in enumerate(train_lines):

@@ -150,50 +136,49 @@ def get_x_y(train_txt, word2vec_len, input_size, word2vec, percent_dataset):
                 x_matrix[i, j, :] = word2vec[word]
 
         #insert y
-        y_matrix[i] = label
+        y_matrix[i][label] = 1.0
 
     return x_matrix, y_matrix
 
+###################################################
+############### data augmentation #################
+###################################################
+
+#generate more data with standard augmentation
+def gen_standard_aug(train_orig, output_file):
+    writer = open(output_file, 'w')
+    lines = open(train_orig, 'r').readlines()
+    for i, line in enumerate(lines):
+        parts = line[:-1].split('\t')
+        label = parts[0]
+        sentence = parts[1]
+        aug_sentences = standard_augmentation(sentence)
+        for aug_sentence in aug_sentences:
+            writer.write(label + "\t" + aug_sentence + '\n')
+    writer.close()
 
 ###################################################
 ##################### model #######################
 ###################################################
 
 #building the model in keras
-def build_model(sentence_length, word2vec_len):
+def build_model(sentence_length, word2vec_len, num_classes):
     model = None
     model = Sequential()
-    model.add(Bidirectional(LSTM(50, return_sequences=True), input_shape=(sentence_length, word2vec_len)))
+    model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(sentence_length, word2vec_len)))
     model.add(Dropout(0.5))
-    model.add(Bidirectional(LSTM(50, return_sequences=False)))
+    model.add(Bidirectional(LSTM(128, return_sequences=False)))
     model.add(Dropout(0.5))
     model.add(Dense(20, activation='relu'))
-    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
-    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
+    model.add(Dense(num_classes, kernel_initializer='normal', activation='softmax'))
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
     #print(model.summary())
     return model
 
-#confidences to binary
-def conf_to_pred(y):
-
-    if type(y) == list:
-        y_class = []
-        for pred in y:
-            if pred < 0.5:
-                y_class.append(0)
-            else:
-                y_class.append(1)
-        return y_class
-
-    else:
-        y_class = np.zeros(y.shape)
-        for i in range(y.shape[0]):
-            if y[i] < 0.5:
-                y_class[i] = 0
-            else:
-                y_class[i] = 1
-        return y_class
-
+#one hot to categorical
+def one_hot_to_categorical(y):
+    assert len(y.shape) == 2
+    return np.argmax(y, axis=1)
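
For a quick sense of the new multi-class label handling (one-hot rows filled in get_x_y, argmax back to class indices in one_hot_to_categorical), here is a minimal standalone sketch; the label values are made up for illustration:

import numpy as np

num_classes = 6                           # matches config.py for TREC
labels = [3, 0, 5]                        # hypothetical class indices
y = np.zeros((len(labels), num_classes))
for i, label in enumerate(labels):
    y[i][label] = 1.0                     # same one-hot fill as get_x_y

# one_hot_to_categorical is a row-wise argmax back to class indices
assert list(np.argmax(y, axis=1)) == labels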

code/nlp_aug.py (+5)

@@ -56,6 +56,7 @@ def get_only_chars(line):
 # Replace n words in the sentence with synonyms from wordnet
 ########################################################################
 
+#for the first time you use wordnet
 #import nltk
 #nltk.download('wordnet')
 from nltk.corpus import wordnet

@@ -190,6 +191,10 @@ def standard_augmentation(sentence, sr=3, rd=2, rs=2, ri=2, num=3):
     augmented_sentences = [get_only_chars(sentence) for sentence in augmented_sentences]
     return augmented_sentences
 
+########################################################################
+# Testing
+########################################################################
+
 if __name__ == '__main__':
 
     line = 'Hi. My name is Jason. I’m a third-year computer science major at Dartmouth College, interested in deep learning and computer vision. My advisor is Saeed Hassanpour. I’m currently working on deep learning for lung cancer classification.'
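
For reference, gen_standard_aug in methods.py calls standard_augmentation once per training sentence; a minimal usage sketch (the input sentence is hypothetical, the output varies with the random seed and the sr/rd/rs/ri/num defaults, and wordnet must be downloaded first as the new comment notes):

from nlp_aug import standard_augmentation
aug_sentences = standard_augmentation('what is the capital of france')
# a list of character-cleaned, augmented variants of the input sentence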

figures.xlsx (14.9 KB): binary file not shown.
1 other binary file changed (35 Bytes): binary file not shown.

preprocess/trej_clean.py (+24, new file)

@@ -0,0 +1,24 @@
+
+from utils import *
+
+class_name_to_num = {'DESC': 0, 'ENTY':1, 'ABBR':2, 'HUM': 3, 'LOC': 4, 'NUM': 5}
+
+def clean(input_file, output_file):
+    lines = open(input_file, 'r').readlines()
+    writer = open(output_file, 'w')
+    for line in lines:
+        parts = line[:-1].split(' ')
+        tag = parts[0].split(':')[0]
+        class_num = class_name_to_num[tag]
+        sentence = get_only_chars(' '.join(parts[1:]))
+        print(tag, class_num, sentence)
+        output_line = str(class_num) + '\t' + sentence
+        writer.write(output_line + '\n')
+    writer.close()
+
+
+if __name__ == "__main__":
+
+    clean('raw/trec/train_copy.txt', 'datasets/trec/train_orig.txt')
+    clean('raw/trec/test_copy.txt', 'datasets/trec/test.txt')
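
To see roughly what clean() produces: a raw TREC line such as 'DESC:manner How did serfdom develop in and then leave Russia ?' (the standard coarse:fine label prefix followed by the question) would be written out as class 0 plus the cleaned, lowercased text, approximately '0<TAB>how did serfdom develop in and then leave russia'; the exact output depends on get_only_chars.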

preprocess/utils.py (+1)

@@ -9,6 +9,7 @@ def get_only_chars(line):
     clean_line = ""
 
     line = line.lower()
+    line = line.replace(" 's", " is") #replace tokenized " 's" with " is"
     line = line.replace("-", " ") #replace hyphens with spaces
     line = line.replace("\t", " ")
     line = line.replace("\n", " ")
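
A quick illustration of the new replacement (hypothetical input): a tokenized question like "what 's a fear of shadows ?" becomes "what is a fear of shadows ?" before the existing hyphen, tab, and newline replacements run.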

~$figures.xlsx (165 Bytes): binary file not shown.
