Commit a956ef3

add trec dataset

1 parent 8505ac7, commit a956ef3
13 files changed: +97 −63 lines

.gitignore (+3 −1)

@@ -1,3 +1,5 @@
 datasets*
 word2vec*
-
+raw*
+checkpoints*
+paper*

code/2_train_eval.py (+16 −11)

@@ -3,30 +3,31 @@
 from numpy.random import seed
 seed(0)
 
-def run_model(train_file, test_file, percent_dataset, epochs_base):
+def run_model(train_file, test_file, num_classes, percent_dataset, epochs_base):
 
     #initialize model
-    model = build_model(input_size, word2vec_len)
+    model = build_model(input_size, word2vec_len, num_classes)
 
     #load data
     word2vec = load_pickle(word2vec_pickle)
-    train_x, train_y = get_x_y(train_file, word2vec_len, input_size, word2vec, percent_dataset)
+    train_x, train_y = get_x_y(train_file, num_classes, word2vec_len, input_size, word2vec, percent_dataset)
     print("loaded data with shape:", train_x.shape, train_y.shape)
-    test_x, test_y = get_x_y(test_file, word2vec_len, input_size, word2vec, 1)
+    test_x, test_y = get_x_y(test_file, num_classes, word2vec_len, input_size, word2vec, 1)
 
     #train model
     n_epochs = min(500, int(epochs_base/percent_dataset))
     model.fit(train_x, train_y, batch_size=1024, epochs=n_epochs, validation_split=0.1, shuffle=True, verbose=0)
-    #model = load_model('lol')
-    #model.save('lol')
+    #model.save('checkpoints/lol')
+    #model = load_model('checkpoints/lol')
 
     #evaluate model
     y_pred = model.predict(test_x)
-    y_pred_binary = conf_to_pred(y_pred)
-    acc = accuracy_score(test_y, y_pred_binary)
+    test_y_cat = one_hot_to_categorical(test_y)
+    y_pred_cat = one_hot_to_categorical(y_pred)
+    acc = accuracy_score(test_y_cat, y_pred_cat)
 
     #return the accuracy
-    print(train_file, test_file, percent_dataset, acc)
+    print('train_file', train_file, 'test file', test_file, 'with fraction', percent_dataset, 'had accuracy', acc)
     return acc
 
 if __name__ == "__main__":

@@ -36,9 +37,13 @@ def run_model(train_file, test_file, percent_dataset, epochs_base):
     aug_accs = []
 
     for increment in increments:
-        orig_acc = run_model(train_orig, test_path, increment, epochs_base=5)
+        orig_acc = run_model(train_orig, test_path, num_classes, increment, epochs_base=20)
         orig_accs.append(orig_acc)
-        aug_acc = run_model(train_aug_st, test_path, increment, epochs_base=2)
+        aug_acc = run_model(train_aug_st, test_path, num_classes, increment, epochs_base=2)
        aug_accs.append(aug_acc)
 
+    #testing
+    #run_model(train_orig, test_path, num_classes, 1, epochs_base=2)
+
+
     print(orig_accs, aug_accs)
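
Note on the epoch schedule above: n_epochs = min(500, int(epochs_base/percent_dataset)) scales the number of epochs inversely with the fraction of training data used, capped at 500. For example, with epochs_base=20 and percent_dataset=0.01, int(20/0.01) = 2000, so training runs the capped 500 epochs; with the full dataset (percent_dataset=1), it runs 20 epochs.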
3 binary files changed (35 Bytes, −91 Bytes, 11 Bytes): binary files not shown.

code/config.py (+17 −5)

@@ -1,13 +1,25 @@
+#user inputs
 
+#dataset folder
+dataset_folder = 'datasets/trec'
 
-#user inputs
-dataset_folder = 'datasets/subj'
-input_size = 50 #number of words
+#number of output classes
+num_classes = 6
+
+#number of words for input
+input_size = 50
+
+#dataset increments
+increments = [0.001, 0.003, 0.01, 0.05, 0.1, 0.25, 0.5, 1]
+
+#word2vec dictionary
 huge_word2vec = 'word2vec/glove.840B.300d.txt'
 word2vec_len = 300
 word2vec_pickle = dataset_folder + '/word2vec.p' # don't want to load the huge pickle every time, so just save the words that are actually used into a smaller dictionary
-increments = [0.001, 0.003, 0.01, 0.05, 0.1, 0.25, 0.5, 1]
 
+#pre-existing file locations
 train_orig = dataset_folder + '/train_orig.txt'
-train_aug_st = dataset_folder + '/train_aug_st.txt'
 test_path = dataset_folder + '/test.txt'
+
+#files to be created
+train_aug_st = dataset_folder + '/train_aug_st.txt'

code/methods.py (+31 −46)

@@ -1,4 +1,3 @@
-
 from keras.layers.core import Dense, Activation, Dropout
 from keras.layers.recurrent import LSTM
 from keras.layers import Bidirectional

@@ -13,7 +12,7 @@
 import numpy as np
 import random
 from random import randint
-random.seed( 3 )
+random.seed(3)
 import datetime, re, operator
 from random import shuffle

@@ -23,6 +22,7 @@
 from os.path import isfile, join, isdir
 import pickle
 
+#import data augmentation methods
 from nlp_aug import *
 
 ###################################################

@@ -70,7 +70,7 @@ def get_all_txt_paths(master_folder):
 ################ data processing ##################
 ###################################################
 
-#get the pickle file for the vocab so you don't have to load the entire dictionary
+#get the pickle file for the word2vec so you don't have to load the entire huge file each time
 def gen_vocab_dicts(folder, output_pickle_path, huge_word2vec):
 
     vocab = set()

@@ -108,22 +108,8 @@ def gen_vocab_dicts(folder, output_pickle_path, huge_word2vec):
     pickle.dump(word2vec, open(output_pickle_path, 'wb'))
     print("dictionaries outputted to", output_pickle_path)
 
-#generate more data with standard augmentation
-def gen_standard_aug(train_orig, output_file):
-    writer = open(output_file, 'w')
-    lines = open(train_orig, 'r').readlines()
-    for i, line in enumerate(lines):
-        parts = line[:-1].split('\t')
-        label = parts[0]
-        sentence = parts[1]
-        aug_sentences = standard_augmentation(sentence)
-        for aug_sentence in aug_sentences:
-            writer.write(label + "\t" + aug_sentence + '\n')
-    writer.close()
-
-
 #getting the x and y inputs in numpy array form from the text file
-def get_x_y(train_txt, word2vec_len, input_size, word2vec, percent_dataset):
+def get_x_y(train_txt, num_classes, word2vec_len, input_size, word2vec, percent_dataset):
 
     #read in lines
     train_lines = open(train_txt, 'r').readlines()

@@ -133,7 +119,7 @@ def get_x_y(train_txt, word2vec_len, input_size, word2vec, percent_dataset):
 
     #initialize x and y matrix
     x_matrix = np.zeros((num_lines, input_size, word2vec_len))
-    y_matrix = np.zeros((num_lines))
+    y_matrix = np.zeros((num_lines, num_classes))
 
     #insert values
     for i, line in enumerate(train_lines):

@@ -150,50 +136,49 @@ def get_x_y(train_txt, word2vec_len, input_size, word2vec, percent_dataset):
                 x_matrix[i, j, :] = word2vec[word]
 
         #insert y
-        y_matrix[i] = label
+        y_matrix[i][label] = 1.0
 
     return x_matrix, y_matrix
 
+###################################################
+############### data augmentation #################
+###################################################
+
+#generate more data with standard augmentation
+def gen_standard_aug(train_orig, output_file):
+    writer = open(output_file, 'w')
+    lines = open(train_orig, 'r').readlines()
+    for i, line in enumerate(lines):
+        parts = line[:-1].split('\t')
+        label = parts[0]
+        sentence = parts[1]
+        aug_sentences = standard_augmentation(sentence)
+        for aug_sentence in aug_sentences:
+            writer.write(label + "\t" + aug_sentence + '\n')
+    writer.close()
 
 ###################################################
 ##################### model #######################
 ###################################################
 
 #building the model in keras
-def build_model(sentence_length, word2vec_len):
+def build_model(sentence_length, word2vec_len, num_classes):
     model = None
     model = Sequential()
-    model.add(Bidirectional(LSTM(50, return_sequences=True), input_shape=(sentence_length, word2vec_len)))
+    model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(sentence_length, word2vec_len)))
     model.add(Dropout(0.5))
-    model.add(Bidirectional(LSTM(50, return_sequences=False)))
+    model.add(Bidirectional(LSTM(128, return_sequences=False)))
     model.add(Dropout(0.5))
     model.add(Dense(20, activation='relu'))
-    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
-    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
+    model.add(Dense(num_classes, kernel_initializer='normal', activation='softmax'))
+    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
     #print(model.summary())
     return model
 
-#confidences to binary
-def conf_to_pred(y):
-
-    if type(y) == list:
-        y_class = []
-        for pred in y:
-            if pred < 0.5:
-                y_class.append(0)
-            else:
-                y_class.append(1)
-        return y_class
-
-    else:
-        y_class = np.zeros(y.shape)
-        for i in range(y.shape[0]):
-            if y[i] < 0.5:
-                y_class[i] = 0
-            else:
-                y_class[i] = 1
-        return y_class
-
+#one hot to categorical
+def one_hot_to_categorical(y):
+    assert len(y.shape) == 2
+    return np.argmax(y, axis=1)
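
For a quick sense of the new multi-class label handling (one-hot rows filled in get_x_y, argmax back to class indices in one_hot_to_categorical), here is a minimal standalone sketch; the label values are made up for illustration:

import numpy as np

num_classes = 6                           # matches config.py for TREC
labels = [3, 0, 5]                        # hypothetical class indices
y = np.zeros((len(labels), num_classes))
for i, label in enumerate(labels):
    y[i][label] = 1.0                     # same one-hot fill as get_x_y

# one_hot_to_categorical is a row-wise argmax back to class indices
assert list(np.argmax(y, axis=1)) == labels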

code/nlp_aug.py (+5)

@@ -56,6 +56,7 @@ def get_only_chars(line):
 # Replace n words in the sentence with synonyms from wordnet
 ########################################################################
 
+#for the first time you use wordnet
 #import nltk
 #nltk.download('wordnet')
 from nltk.corpus import wordnet

@@ -190,6 +191,10 @@ def standard_augmentation(sentence, sr=3, rd=2, rs=2, ri=2, num=3):
     augmented_sentences = [get_only_chars(sentence) for sentence in augmented_sentences]
     return augmented_sentences
 
+########################################################################
+# Testing
+########################################################################
+
 if __name__ == '__main__':
 
     line = 'Hi. My name is Jason. I’m a third-year computer science major at Dartmouth College, interested in deep learning and computer vision. My advisor is Saeed Hassanpour. I’m currently working on deep learning for lung cancer classification.'
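
For reference, gen_standard_aug in methods.py calls standard_augmentation once per training sentence; a minimal usage sketch (the input sentence is hypothetical, the output varies with the random seed and the sr/rd/rs/ri/num defaults, and wordnet must be downloaded first as the new comment notes):

from nlp_aug import standard_augmentation
aug_sentences = standard_augmentation('what is the capital of france')
# a list of character-cleaned, augmented variants of the input sentence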

figures.xlsx (14.9 KB): binary file not shown.
1 other binary file changed (35 Bytes): binary file not shown.

preprocess/trej_clean.py (+24, new file)

@@ -0,0 +1,24 @@
+
+from utils import *
+
+class_name_to_num = {'DESC': 0, 'ENTY':1, 'ABBR':2, 'HUM': 3, 'LOC': 4, 'NUM': 5}
+
+def clean(input_file, output_file):
+    lines = open(input_file, 'r').readlines()
+    writer = open(output_file, 'w')
+    for line in lines:
+        parts = line[:-1].split(' ')
+        tag = parts[0].split(':')[0]
+        class_num = class_name_to_num[tag]
+        sentence = get_only_chars(' '.join(parts[1:]))
+        print(tag, class_num, sentence)
+        output_line = str(class_num) + '\t' + sentence
+        writer.write(output_line + '\n')
+    writer.close()
+
+
+if __name__ == "__main__":
+
+    clean('raw/trec/train_copy.txt', 'datasets/trec/train_orig.txt')
+    clean('raw/trec/test_copy.txt', 'datasets/trec/test.txt')
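
To see roughly what clean() produces: a raw TREC line such as 'DESC:manner How did serfdom develop in and then leave Russia ?' (the standard coarse:fine label prefix followed by the question) would be written out as class 0 plus the cleaned, lowercased text, approximately '0<TAB>how did serfdom develop in and then leave russia'; the exact output depends on get_only_chars.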

preprocess/utils.py (+1)

@@ -9,6 +9,7 @@ def get_only_chars(line):
     clean_line = ""
 
     line = line.lower()
+    line = line.replace(" 's", " is") #replace tokenized " 's" with " is"
     line = line.replace("-", " ") #replace hyphens with spaces
     line = line.replace("\t", " ")
     line = line.replace("\n", " ")
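
A quick illustration of the new replacement (hypothetical input): a tokenized question like "what 's a fear of shadows ?" becomes "what is a fear of shadows ?" before the existing hyphen, tab, and newline replacements run.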

~$figures.xlsx (165 Bytes): binary file not shown.
