import numpy as np
import pandas as pd

from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
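# NOTE: word_tokenize and the English stopword list rely on NLTK data packages
# that must be downloaded once, e.g. nltk.download('punkt') and nltk.download('stopwords').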

from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM

np.random.seed(0)

if __name__ == "__main__":

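    # Assumes the Kaggle "Sentiment Analysis on Movie Reviews" TSV layout:
    # PhraseId, SentenceId, Phrase and, in train.tsv only, a 0-4 Sentiment label.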
    # load data
    train_df = pd.read_csv('./data/sentiment/train.tsv', sep='\t', header=0)
    test_df = pd.read_csv('./data/sentiment/test.tsv', sep='\t', header=0)

    raw_docs_train = train_df['Phrase'].values
    raw_docs_test = test_df['Phrase'].values
    sentiment_train = train_df['Sentiment'].values
    num_labels = len(np.unique(sentiment_train))

    # text pre-processing: drop stopwords and the punctuation tokens that
    # word_tokenize splits off, then stem what remains
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
    stemmer = SnowballStemmer('english')

    print("pre-processing train docs...")
    processed_docs_train = []
    for doc in raw_docs_train:
        tokens = word_tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        stemmed = [stemmer.stem(word) for word in filtered]
        processed_docs_train.append(stemmed)

    print("pre-processing test docs...")
    processed_docs_test = []
    for doc in raw_docs_test:
        tokens = word_tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        stemmed = [stemmer.stem(word) for word in filtered]
        processed_docs_test.append(stemmed)

    # plain list concatenation: np.concatenate would build a ragged object array
    processed_docs_all = processed_docs_train + processed_docs_test

    dictionary = corpora.Dictionary(processed_docs_all)
    dictionary_size = len(dictionary.keys())
    print("dictionary size:", dictionary_size)
    #dictionary.save('dictionary.dict')
    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    print("converting to token ids...")
    word_id_train, word_id_len = [], []
    for doc in processed_docs_train:
        word_ids = [dictionary.token2id[word] for word in doc]
        word_id_train.append(word_ids)
        word_id_len.append(len(word_ids))

    word_id_test = []
    for doc in processed_docs_test:
        word_ids = [dictionary.token2id[word] for word in doc]
        word_id_test.append(word_ids)
        word_id_len.append(len(word_ids))

    # cap sequence length at mean + 2 standard deviations of the document lengths
    seq_len = np.round(np.mean(word_id_len) + 2 * np.std(word_id_len)).astype(int)

    # pad/truncate to a fixed length; pad_sequences accepts plain lists of lists
    word_id_train = sequence.pad_sequences(word_id_train, maxlen=seq_len)
    word_id_test = sequence.pad_sequences(word_id_test, maxlen=seq_len)
    y_train_enc = to_categorical(sentiment_train, num_labels)

    # LSTM
    print("fitting LSTM ...")
    model = Sequential()
    # Keras 2 removed Embedding's dropout argument, so apply a separate Dropout
    # layer; LSTM's dropout_W/dropout_U became dropout/recurrent_dropout
    model.add(Embedding(dictionary_size, 128))
    model.add(Dropout(0.2))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(word_id_train, y_train_enc, epochs=1, batch_size=256, verbose=1)


    # predict_classes was removed from newer Keras; take the argmax of the softmax output
    test_pred = np.argmax(model.predict(word_id_test), axis=1)

    # make a submission
    test_df['Sentiment'] = test_pred.reshape(-1, 1)
    header = ['PhraseId', 'Sentiment']
    test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
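    # The written CSV keeps only the PhraseId and Sentiment columns, the format a
    # Kaggle submission for this competition expects.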