
Commit 9ef4198

LSTM sentiment

1 parent 9574acd commit 9ef4198

File tree

3 files changed: +107 −0 lines changed

README.md

Lines changed: 14 additions & 0 deletions
@@ -3,6 +3,20 @@ Kaggle Competitions

### Description

**Sentiment Prediction**

The goal of this competition is to predict the sentiment of movie reviews from the Rotten Tomatoes dataset. An LSTM recurrent neural network (RNN) is used to predict sentiment from the pre-processed text. The pre-processing includes tokenization, stop-word removal, and word stemming. The word tokens are then converted to sequences of token IDs, which are padded to a fixed length and used as input to the LSTM.
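
A minimal sketch of this pipeline on two toy phrases (an illustration under the same libraries the kernel uses; the phrases are made up, and it assumes the NLTK `punkt` and `stopwords` data are installed):

```python
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from gensim import corpora
from keras.preprocessing import sequence

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

docs = ["an intelligent and moving film", "a clumsy boring plot"]

processed = []
for doc in docs:
    tokens = word_tokenize(doc)                            # tokenization
    filtered = [w for w in tokens if w not in stop_words]  # stop-word removal
    processed.append([stemmer.stem(w) for w in filtered])  # stemming

dictionary = corpora.Dictionary(processed)  # maps each token to an integer id
ids = [[dictionary.token2id[w] for w in d] for d in processed]

# shorter sequences are left-padded with zeros to form a fixed-size
# input matrix (note that 0 is also a valid gensim token id)
padded = sequence.pad_sequences(ids, maxlen=4)
```
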
<p align="center">
<img src="https://github.com/vsmolyakov/kaggle/blob/master/sentiment/figures/LSTM_chain.png" />
</p>

The figure above shows the LSTM architecture unrolled over three time steps, highlighting its ability to process sequential data.
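
For reference, a standard formulation of the LSTM cell pictured in such chain diagrams (the equations are textbook material, not spelled out in this repository) is

$$
\begin{aligned}
i_t &= \sigma(W_i x_t + U_i h_{t-1} + b_i) \\
f_t &= \sigma(W_f x_t + U_f h_{t-1} + b_f) \\
o_t &= \sigma(W_o x_t + U_o h_{t-1} + b_o) \\
\tilde{c}_t &= \tanh(W_c x_t + U_c h_{t-1} + b_c) \\
c_t &= f_t \odot c_{t-1} + i_t \odot \tilde{c}_t \\
h_t &= o_t \odot \tanh(c_t)
\end{aligned}
$$

where the cell state $c_t$ carries information across time steps, which is what lets the network retain context over a whole phrase.
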
References:

*https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews*

**Denoising**

In the document denoising competition, the goal is to remove background noise as a preprocessing step for an Optical Character Recognition (OCR) system.

sentiment/figures/LSTM_chain.png

86.3 KB

sentiment/sentiment_kernel.py

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
import numpy as np
import pandas as pd

from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM

np.random.seed(0)

if __name__ == "__main__":

    #load data
    train_df = pd.read_csv('./data/sentiment/train.tsv', sep='\t', header=0)
    test_df = pd.read_csv('./data/sentiment/test.tsv', sep='\t', header=0)

    raw_docs_train = train_df['Phrase'].values
    raw_docs_test = test_df['Phrase'].values
    sentiment_train = train_df['Sentiment'].values
    num_labels = len(np.unique(sentiment_train))

    #text pre-processing: stop-words cover common English words and punctuation
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
    stemmer = SnowballStemmer('english')

    print "pre-processing train docs..."
    processed_docs_train = []
    for doc in raw_docs_train:
        tokens = word_tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        stemmed = [stemmer.stem(word) for word in filtered]
        processed_docs_train.append(stemmed)

    print "pre-processing test docs..."
    processed_docs_test = []
    for doc in raw_docs_test:
        tokens = word_tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        stemmed = [stemmer.stem(word) for word in filtered]
        processed_docs_test.append(stemmed)

    #build a single vocabulary over both train and test docs
    processed_docs_all = processed_docs_train + processed_docs_test

    dictionary = corpora.Dictionary(processed_docs_all)
    dictionary_size = len(dictionary.keys())
    print "dictionary size: ", dictionary_size
    #dictionary.save('dictionary.dict')
    #corpus = [dictionary.doc2bow(doc) for doc in processed_docs_all]

    print "converting to token ids..."
    word_id_train, word_id_len = [], []
    for doc in processed_docs_train:
        word_ids = [dictionary.token2id[word] for word in doc]
        word_id_train.append(word_ids)
        word_id_len.append(len(word_ids))

    word_id_test = []
    for doc in processed_docs_test:
        word_ids = [dictionary.token2id[word] for word in doc]
        word_id_test.append(word_ids)
        word_id_len.append(len(word_ids))

    #fix the sequence length at mean + 2*std of the observed doc lengths
    seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)

    #pad the id sequences to a fixed length and one-hot encode the labels
    word_id_train = sequence.pad_sequences(word_id_train, maxlen=seq_len)
    word_id_test = sequence.pad_sequences(word_id_test, maxlen=seq_len)
    y_train_enc = np_utils.to_categorical(sentiment_train, num_labels)

    #LSTM: embedding -> LSTM -> dense layer with softmax over the sentiment labels
    print "fitting LSTM ..."
    model = Sequential()
    model.add(Embedding(dictionary_size, 128, dropout=0.2))
    model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(word_id_train, y_train_enc, nb_epoch=1, batch_size=256, verbose=1)

    test_pred = model.predict_classes(word_id_test)

    #make a submission
    test_df['Sentiment'] = test_pred.reshape(-1,1)
    header = ['PhraseId', 'Sentiment']
    test_df.to_csv('./lstm_sentiment.csv', columns=header, index=False, header=True)
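
To reproduce the run, the NLTK data used by the kernel has to be fetched once; a setup sketch, assuming NLTK itself is installed (note the kernel targets the Keras 1.x-era API, e.g. `nb_epoch` and `dropout_W`):

```python
import nltk
nltk.download('punkt')      # tokenizer models behind word_tokenize
nltk.download('stopwords')  # English stop-word list used for filtering
```

The train.tsv and test.tsv files from the competition page are expected under ./data/sentiment/, as hard-coded in the script.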

0 commit comments