'''
Python 3.6, TextBlob model
This file contains the code required to test the various sentiment-analysis
pipelines built on TextBlob. The results of each pipeline are written to its
own output file in a tab-separated CSV format.
Instructions for executing the file can be found at the bottom of the file.
'''
import csv

from textblob import TextBlob

import tweetCleaner
import tweetProcesser
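
# The helpers used below come from tweetCleaner and tweetProcesser elsewhere in
# this repository. As a rough illustration only (an assumption, not the
# repository's actual code), a classifier matching the signature used below,
# taking a TextBlob and an emoticon score and returning a
# (normalized_score, label) pair, might look like this:
def _sketch_sentimentClassifier(blob, emoticon_score):
    # TextBlob polarity falls in [-1.0, 1.0]; fold the emoticon score into it.
    score = blob.sentiment.polarity + emoticon_score
    if score > 0:
        return score, "positive"
    if score < 0:
        return score, "negative"
    return score, "neutral"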


def TextBlobCleanRaw():
    '''
    Raw TextBlob model.
    Baseline pipeline: standard cleaning only, with no abbreviation expansion
    or emoticon scoring.
    '''
    tweet_counter = 0
    with open("results_textblob_raw.txt", "w", encoding="utf-8") as preresults:
        newWriter = csv.writer(preresults, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        with open("raw_twitter.txt", "r", encoding="utf-8") as preprocessed:
            for line in preprocessed:
                tweet_counter += 1
                try:
                    print("Processing tweet: {}".format(tweet_counter))
                    # Clean the tweet before scoring it.
                    tweet = tweetCleaner.lowercase(line)
                    tweet = tweetCleaner.StopWordRemover(tweet)
                    tweet = tweetCleaner.removeSpecialChars(tweet)
                    tweet = tweetCleaner.removeAllNonAlpha(tweet)
                    tweet = tweetCleaner.lemmatizer(tweet)
                    # Score the cleaned tweet; this model uses no emoticon score (0).
                    blob = TextBlob(tweet)
                    normalized_score, sentiment_label = tweetProcesser.sentimentClassifier(blob, 0)
                    newWriter.writerow([normalized_score, sentiment_label, tweet])
                except Exception:
                    # Write a placeholder row so the output stays aligned with the input.
                    newWriter.writerow(["0", "neutral", "BLANK"])
                    print("ERROR processing tweet: {}".format(tweet_counter))


def TextBlobCleanAbbrev():
    '''
    TextBlob model with abbreviations expanded.
    '''
    tweet_counter = 0
    # Generate abbreviations_twitter.txt from the raw tweets first.
    tweetProcesser.abbreviation_extender()
    with open("results_textblob_abbrev.txt", "w", encoding="utf-8") as postresults:
        newWriter = csv.writer(postresults, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        with open("abbreviations_twitter.txt", "r", encoding="utf-8") as postprocessed:
            for line in postprocessed:
                tweet_counter += 1
                try:
                    print("Processing tweet: {}".format(tweet_counter))
                    tweet = tweetCleaner.StopWordRemover(line)
                    tweet = tweetCleaner.removeSpecialChars(tweet)
                    tweet = tweetCleaner.removeAllNonAlpha(tweet)
                    tweet = tweetCleaner.lemmatizer(tweet)
                    blob = TextBlob(tweet)
                    normalized_score, sentiment_label = tweetProcesser.sentimentClassifier(blob, 0)
                    newWriter.writerow([normalized_score, sentiment_label, tweet])
                except Exception:
                    newWriter.writerow(["0", "neutral", "BLANK"])
                    print("ERROR processing tweet: {}".format(tweet_counter))


def TextBlobCleanEmoji():
    '''
    TextBlob model with emoticon scoring.
    '''
    tweet_counter = 0
    with open("results_textblob_emoji.txt", "w", encoding="utf-8") as preresults:
        newWriter = csv.writer(preresults, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        with open("raw_twitter.txt", "r", encoding="utf-8") as preprocessed:
            for line in preprocessed:
                tweet_counter += 1
                try:
                    print("Processing tweet: {}".format(tweet_counter))
                    tweet = tweetCleaner.lowercase(line)
                    tweet = tweetCleaner.StopWordRemover(tweet)
                    tweet = tweetCleaner.removeSpecialChars(tweet)
                    # Score the emoticons before the non-alphabetic filter
                    # below strips them out.
                    tweet, score = tweetProcesser.emoticon_score(tweet)
                    tweet = tweetCleaner.removeAllNonAlpha(tweet)
                    tweet = tweetCleaner.lemmatizer(tweet)
                    blob = TextBlob(tweet)
                    normalized_score, sentiment_label = tweetProcesser.sentimentClassifier(blob, score)
                    newWriter.writerow([normalized_score, sentiment_label, tweet])
                except Exception:
                    newWriter.writerow(["0", "neutral", "ERROR"])
                    print("ERROR processing tweet: {}".format(tweet_counter))


def TextBlobCleanAbbrevEmoji():
    '''
    TextBlob model with emoticon scoring and expanded abbreviations.
    '''
    tweet_counter = 0
    # Generate abbreviations_twitter.txt from the raw tweets first.
    tweetProcesser.abbreviation_extender()
    with open("results_textblob_abbrev_emoji.txt", "w", encoding="utf-8") as preresults:
        newWriter = csv.writer(preresults, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        with open("abbreviations_twitter.txt", "r", encoding="utf-8") as preprocessed:
            for line in preprocessed:
                tweet_counter += 1
                try:
                    print("Processing tweet: {}".format(tweet_counter))
                    tweet = tweetCleaner.StopWordRemover(line)
                    tweet = tweetCleaner.removeSpecialChars(tweet)
                    # Score the emoticons before the non-alphabetic filter
                    # below strips them out.
                    tweet, score = tweetProcesser.emoticon_score(tweet)
                    tweet = tweetCleaner.removeAllNonAlpha(tweet)
                    tweet = tweetCleaner.lemmatizer(tweet)
                    blob = TextBlob(tweet)
                    normalized_score, sentiment_label = tweetProcesser.sentimentClassifier(blob, score)
                    newWriter.writerow([normalized_score, sentiment_label, tweet])
                except Exception:
                    newWriter.writerow(["0", "neutral", "ERROR"])
                    print("ERROR processing tweet: {}".format(tweet_counter))
print("====================TEST BEGIN=======================")
'''
BASIC: This is the main function we will be executing.
It combines all the cleaning and processing steps described in the GitHub README.
Run this script in your python command shell.
'''
TextBlobCleanAbbrevEmoji()
'''
ADVANCED: Sometimes, performing excessive cleaning operations on the input may worsen the accuracy of the model.
Hence, here are several other models you may wish to test for accuracy comparison.
The description of the models may be found under the individual functions above.
To test a model, simply comment the above "Basic" model and uncomment any of the models below.
Run this script in your python command shell.
'''
#TextBlobCleanRaw()
#TextBlobCleanAbbrev()
#TextBlobCleanEmoji()
print("====================TEST END=========================")