import numpy as np
import pandas as pd

from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
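# NOTE: word_tokenize and the English stopword list rely on NLTK data packages
# that must be downloaded once, e.g. nltk.download('punkt') and nltk.download('stopwords').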

from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM

np.random.seed(0)

if __name__ == "__main__":

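    # Assumes the Kaggle "Sentiment Analysis on Movie Reviews" TSV layout:
    # PhraseId, SentenceId, Phrase and, in train.tsv only, a 0-4 Sentiment label.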
    # load data
    train_df = pd.read_csv('./data/sentiment/train.tsv', sep='\t', header=0)
    test_df = pd.read_csv('./data/sentiment/test.tsv', sep='\t', header=0)

    raw_docs_train = train_df['Phrase'].values
    raw_docs_test = test_df['Phrase'].values
    sentiment_train = train_df['Sentiment'].values
    num_labels = len(np.unique(sentiment_train))

    # text pre-processing: drop stopwords and the punctuation tokens that
    # word_tokenize splits off, then stem what remains
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
    stemmer = SnowballStemmer('english')

    print("pre-processing train docs...")
    processed_docs_train = []
    for doc in raw_docs_train:
        tokens = word_tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        stemmed = [stemmer.stem(word) for word in filtered]
        processed_docs_train.append(stemmed)

    print("pre-processing test docs...")
    processed_docs_test = []
    for doc in raw_docs_test:
        tokens = word_tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        stemmed = [stemmer.stem(word) for word in filtered]
        processed_docs_test.append(stemmed)

    # plain list concatenation: np.concatenate would build a ragged object array
    processed_docs_all = processed_docs_train + processed_docs_test

    dictionary = corpora.Dictionary(processed_docs_all)
    dictionary_size = len(dictionary.keys())
    print("dictionary size:", dictionary_size)
    #dictionary.save('dictionary.dict')
    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    print("converting to token ids...")
    word_id_train, word_id_len = [], []
    for doc in processed_docs_train:
        word_ids = [dictionary.token2id[word] for word in doc]
        word_id_train.append(word_ids)
        word_id_len.append(len(word_ids))

    word_id_test = []
    for doc in processed_docs_test:
        word_ids = [dictionary.token2id[word] for word in doc]
        word_id_test.append(word_ids)
        word_id_len.append(len(word_ids))

    # cap sequence length at mean + 2 standard deviations of the document lengths
    seq_len = np.round(np.mean(word_id_len) + 2 * np.std(word_id_len)).astype(int)

    # pad/truncate to a fixed length; pad_sequences accepts plain lists of lists
    word_id_train = sequence.pad_sequences(word_id_train, maxlen=seq_len)
    word_id_test = sequence.pad_sequences(word_id_test, maxlen=seq_len)
    y_train_enc = to_categorical(sentiment_train, num_labels)

    # LSTM
    print("fitting LSTM ...")
    model = Sequential()
    # Keras 2 removed Embedding's dropout argument, so apply a separate Dropout
    # layer; LSTM's dropout_W/dropout_U became dropout/recurrent_dropout
    model.add(Embedding(dictionary_size, 128))
    model.add(Dropout(0.2))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(word_id_train, y_train_enc, epochs=1, batch_size=256, verbose=1)


    # predict_classes was removed from newer Keras; take the argmax of the softmax output
    test_pred = np.argmax(model.predict(word_id_test), axis=1)

    # make a submission
    test_df['Sentiment'] = test_pred.reshape(-1, 1)
    header = ['PhraseId', 'Sentiment']
    test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
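    # The written CSV keeps only the PhraseId and Sentiment columns, the format a
    # Kaggle submission for this competition expects.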