This repository was archived by the owner on May 29, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmain_NLTK.py
217 lines (165 loc) · 6.67 KB
/
main_NLTK.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
'''
Python 3.6, Python NLTK model
This file contains the code required to test the various models under the Python NLTK model.
The results will be written into their individual output file in a CSV format.
Instructions to execute the file can be found at the bottom of the file.
'''
import csv
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
import tweetCleaner
import tweetProcesser
sentiment = SentimentIntensityAnalyzer()
def NLTKCleanRaw():
    """
    Raw NLTK model.

    Reads one tweet per line from raw_twitter.txt, cleans it (lowercase,
    stopword removal, special-character removal, non-alpha removal,
    lemmatization), scores every sentence with VADER, and writes a
    [score, label] row per tweet to results_nltk_raw.txt as
    tab-separated CSV. A tweet that fails any processing step is
    recorded as neutral so the output stays line-aligned with the input.
    """
    tweet_counter = 0
    with open("results_nltk_raw.txt", "w", encoding="utf-8") as postresults:
        newWriter = csv.writer(postresults, delimiter='\t', quotechar='|',
                               quoting=csv.QUOTE_MINIMAL)
        with open("raw_twitter.txt", "r", encoding="utf-8") as postprocessed:
            for line in postprocessed.readlines():
                total_score = 0
                tweet_counter += 1
                try:
                    print("Processing tweet: {}".format(tweet_counter))
                    tweet = tweetCleaner.lowercase(line)
                    tweet = tweetCleaner.StopWordRemover(tweet)
                    tweet = tweetCleaner.removeSpecialChars(tweet)
                    tweet = tweetCleaner.removeAllNonAlpha(tweet)
                    tweet = tweetCleaner.lemmatizer(tweet)
                    # Net sentiment: sum of VADER positive minus negative
                    # intensity over every sentence of the tweet.
                    for sentence in tokenize.sent_tokenize(tweet):
                        ss = sentiment.polarity_scores(sentence)
                        total_score += ss["pos"] - ss["neg"]
                    total_score = round(total_score, 3)
                    if total_score == 0:
                        newWriter.writerow([0, "neutral"])
                    elif total_score > 0:
                        newWriter.writerow([total_score, "positive"])
                    else:
                        newWriter.writerow([total_score, "negative"])
                except Exception:
                    # Was a bare `except:`; narrowed so Ctrl-C / SystemExit
                    # still abort the run instead of logging a fake neutral.
                    newWriter.writerow([0, "neutral"])
                    print("ERROR processing tweet: {}".format(tweet_counter))
def NLTKCleanAbbrev():
    """
    NLTK model with extended abbreviations.

    Expands abbreviations into abbreviations_twitter.txt via
    tweetProcesser.abbreviation_extender(), then cleans and VADER-scores
    each tweet, writing [score, label] rows to results_nltk_abbrev.txt
    as tab-separated CSV. Failed tweets are recorded as neutral.

    BUG FIX: the original body started its cleaning chain with
    `tweet = tweetCleaner.StopWordRemover(tweet)` without ever assigning
    `tweet` from `line`, so every iteration raised NameError, was
    swallowed by the bare except, and every tweet came out "neutral".
    The missing `lowercase(line)` step (present in all sibling
    functions) is restored.
    """
    tweet_counter = 0
    tweetProcesser.abbreviation_extender()
    with open("results_nltk_abbrev.txt", "w", encoding="utf-8") as postresults:
        newWriter = csv.writer(postresults, delimiter='\t', quotechar='|',
                               quoting=csv.QUOTE_MINIMAL)
        with open("abbreviations_twitter.txt", "r", encoding="utf-8") as postprocessed:
            for line in postprocessed.readlines():
                total_score = 0
                tweet_counter += 1
                try:
                    print("Processing tweet: {}".format(tweet_counter))
                    # Restored: seed the pipeline from the current line.
                    tweet = tweetCleaner.lowercase(line)
                    tweet = tweetCleaner.StopWordRemover(tweet)
                    tweet = tweetCleaner.removeSpecialChars(tweet)
                    tweet = tweetCleaner.removeAllNonAlpha(tweet)
                    tweet = tweetCleaner.lemmatizer(tweet)
                    # Net sentiment over all sentences of the tweet.
                    for sentence in tokenize.sent_tokenize(tweet):
                        ss = sentiment.polarity_scores(sentence)
                        total_score += ss["pos"] - ss["neg"]
                    total_score = round(total_score, 3)
                    if total_score == 0:
                        newWriter.writerow([0, "neutral"])
                    elif total_score > 0:
                        newWriter.writerow([total_score, "positive"])
                    else:
                        newWriter.writerow([total_score, "negative"])
                except Exception:
                    # Narrowed from bare `except:` so interrupts propagate.
                    newWriter.writerow([0, "neutral"])
                    print("ERROR processing tweet: {}".format(tweet_counter))
def NLTKCleanEmoji():
    """
    NLTK model with emoticon scoring.

    Reads tweets from raw_twitter.txt, cleans each one, lets
    tweetProcesser.emoticon_score seed the tweet's score from its
    emoticons, then adds VADER sentence scores and writes a
    [score, label] row per tweet to results_nltk_emoji.txt as
    tab-separated CSV. A tweet that fails processing is logged neutral.
    """
    count = 0
    with open("results_nltk_emoji.txt", "w", encoding="utf-8") as out_file:
        writer = csv.writer(out_file, delimiter='\t', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        with open("raw_twitter.txt", "r", encoding="utf-8") as in_file:
            for raw_line in in_file.readlines():
                count += 1
                score = 0
                try:
                    print("Processing tweet: {}".format(count))
                    cleaned = tweetCleaner.lowercase(raw_line)
                    cleaned = tweetCleaner.StopWordRemover(cleaned)
                    cleaned = tweetCleaner.removeSpecialChars(cleaned)
                    # emoticon_score returns the text with emoticons handled
                    # plus an initial score contribution from them.
                    cleaned, score = tweetProcesser.emoticon_score(cleaned)
                    cleaned = tweetCleaner.removeAllNonAlpha(cleaned)
                    cleaned = tweetCleaner.lemmatizer(cleaned)
                    for part in tokenize.sent_tokenize(cleaned):
                        polarity = sentiment.polarity_scores(part)
                        score += polarity["pos"] - polarity["neg"]
                    score = round(score, 3)
                    if score == 0:
                        writer.writerow([0, "neutral"])
                    elif score > 0:
                        writer.writerow([score, "positive"])
                    else:
                        writer.writerow([score, "negative"])
                except:
                    writer.writerow([0, "neutral"])
                    print("ERROR processing tweet: {}".format(count))
def NLTKCleanAbbrevEmoji():
    """
    NLTK model with extended abbreviations AND emoticon scoring.

    Expands abbreviations into abbreviations_twitter.txt via
    tweetProcesser.abbreviation_extender(), cleans each tweet, seeds the
    score from its emoticons (tweetProcesser.emoticon_score), adds VADER
    sentence scores, and writes [score, label] rows to
    results_nltk_abbrev_emoji.txt as tab-separated CSV. Failed tweets
    are recorded as neutral.
    """
    tweet_counter = 0
    tweetProcesser.abbreviation_extender()
    with open("results_nltk_abbrev_emoji.txt", "w", encoding="utf-8") as postresults:
        newWriter = csv.writer(postresults, delimiter='\t', quotechar='|',
                               quoting=csv.QUOTE_MINIMAL)
        with open("abbreviations_twitter.txt", "r", encoding="utf-8") as postprocessed:
            for line in postprocessed.readlines():
                total_score = 0
                tweet_counter += 1
                try:
                    print("Processing tweet: {}".format(tweet_counter))
                    tweet = tweetCleaner.lowercase(line)
                    tweet = tweetCleaner.StopWordRemover(tweet)
                    tweet = tweetCleaner.removeSpecialChars(tweet)
                    tweet, total_score = tweetProcesser.emoticon_score(tweet)
                    tweet = tweetCleaner.removeAllNonAlpha(tweet)
                    tweet = tweetCleaner.lemmatizer(tweet)
                    # Renamed the inner loop variable from `line` to
                    # `sentence`: it shadowed the outer file-line variable
                    # and diverged from the sibling functions' naming.
                    for sentence in tokenize.sent_tokenize(tweet):
                        ss = sentiment.polarity_scores(sentence)
                        total_score += ss["pos"] - ss["neg"]
                    total_score = round(total_score, 3)
                    if total_score == 0:
                        newWriter.writerow([0, "neutral"])
                    elif total_score > 0:
                        newWriter.writerow([total_score, "positive"])
                    else:
                        newWriter.writerow([total_score, "negative"])
                except Exception:
                    # Narrowed from bare `except:` so interrupts propagate.
                    newWriter.writerow([0, "neutral"])
                    print("ERROR processing tweet: {}".format(tweet_counter))
# BASIC: NLTKCleanAbbrevEmoji() is the main model; it combines all the
# cleaning and processing steps described in the GitHub README.
#
# ADVANCED: Sometimes, performing excessive cleaning operations on the
# input may worsen the accuracy of the model. Several other models are
# available for accuracy comparison (see the individual functions above).
# To test one, comment out the "Basic" call and uncomment a model below.
# Run this script in your python command shell.
if __name__ == "__main__":
    # Guarded so importing this module (e.g. from a test) does not
    # trigger a full run with its file I/O side effects.
    print("====================TEST BEGIN=======================")
    NLTKCleanAbbrevEmoji()
    #NLTKCleanRaw()
    #NLTKCleanAbbrev()
    #NLTKCleanEmoji()
    print("====================TEST END=========================")