-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmodel.py
91 lines (70 loc) · 3.09 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from nltk.corpus import twitter_samples, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import NaiveBayesClassifier
import re, string
import random
import pickle
import pandas as pd
class Sentiment:
    """Train a Naive Bayes classifier that labels text 'Positive' or 'Negative'.

    'Positive' examples are cleaned tweets taken from NLTK's
    ``twitter_samples`` corpus (slices of both its positive and negative
    files, combined in ``non_abusive``). 'Negative' examples are the single
    words loaded from ``bad-words.csv``.
    """

    # Patterns for hyperlinks and @-mentions, compiled once instead of on
    # every token. Raw strings avoid the invalid "\(" escape warning while
    # producing the same pattern text.
    _URL_RE = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    _MENTION_RE = re.compile(r"(@[A-Za-z0-9_]+)")

    # Upper bound on the number of shuffled examples handed to the trainer.
    _MAX_TRAIN_EXAMPLES = 6000

    def __init__(self):
        """Load stop words, the tweet corpora and the abusive word list."""
        self.stop_words = stopwords.words('english')
        self.positive_cleaned_tokens_list = []
        self.negative_cleaned_tokens_list = []
        self.positive_tweets_tokens = twitter_samples.tokenized('positive_tweets.json')
        self.negative_tweets_tokens = twitter_samples.tokenized('negative_tweets.json')
        # Both sentiment polarities count as "non abusive" text here.
        self.non_abusive = self.positive_tweets_tokens[:808] + self.negative_tweets_tokens[:811]
        # NOTE(review): the column is selected by the name 'jigaboo', which
        # looks like the file's first data row being consumed as a header
        # (and therefore dropped from the list) - confirm against
        # bad-words.csv before changing.
        self.abusive_words = pd.read_csv('bad-words.csv')['jigaboo']
        self.abusive = list(self.abusive_words)

    def remove_noise(self, token_):
        """Return the cleaned, lemmatized, lower-cased tokens of one tweet.

        Args:
            token_: sequence of string tokens for a single tweet.

        Returns:
            A list of lower-cased lemmas with hyperlinks, @-mentions,
            punctuation, stop words and empty tokens removed.
        """
        # Built once per call rather than once per token.
        lemmatizer = WordNetLemmatizer()
        stop_words = set(self.stop_words)

        cleaned_tokens = []
        for token, tag in pos_tag(token_):
            # Strip hyperlinks and usernames preceded by '@'.
            token = self._URL_RE.sub('', token)
            token = self._MENTION_RE.sub('', token)

            # Map the Penn Treebank tag prefix to a WordNet part of speech.
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            else:
                pos = 'a'
            token = lemmatizer.lemmatize(token, pos)

            lowered = token.lower()
            # `in string.punctuation` is a substring test on the
            # punctuation string, kept exactly as the original wrote it.
            if token and token not in string.punctuation and lowered not in stop_words:
                cleaned_tokens.append(lowered)
        return cleaned_tokens

    def get_tweet_for_model(self, cleaned_tokens_list, sen):
        """Yield one ``{token: True}`` feature dict per entry.

        Args:
            cleaned_tokens_list: when ``sen == 'p'``, an iterable of token
                lists (one per tweet); otherwise an iterable of single words.
            sen: ``'p'`` to treat entries as token lists, anything else to
                treat each entry as one word.

        Yields:
            A dict mapping each token (or the single word) to ``True``.
        """
        if sen == 'p':
            for tweet_tokens in cleaned_tokens_list:
                yield {token: True for token in tweet_tokens}
        else:
            # The previous `dict([word, True])` always raised TypeError
            # because `True` is not a key/value pair; a single-word feature
            # dict matches how abusive words are encoded for training.
            for word in cleaned_tokens_list:
                yield {word: True}

    def preprocess_data(self):
        """Build the shuffled, labelled training examples.

        Returns:
            A list of ``(feature_dict, label)`` tuples, where label is
            ``"Positive"`` for cleaned tweets and ``'Negative'`` for abusive
            words, truncated to at most ``_MAX_TRAIN_EXAMPLES`` entries.
        """
        # Rebuilt on every call so repeated calls do not accumulate
        # duplicate tweets in the instance attribute.
        self.positive_cleaned_tokens_list = [
            self.remove_noise(tokens) for tokens in self.non_abusive
        ]

        positive_dataset = [
            (features, "Positive")
            for features in self.get_tweet_for_model(self.positive_cleaned_tokens_list, 'p')
        ]
        negative_dataset = [
            (features, 'Negative')
            for features in self.get_tweet_for_model(self.abusive, 'n')
        ]

        dataset = positive_dataset + negative_dataset
        random.shuffle(dataset)
        # Only a cap on the training size: no held-out/test portion is
        # produced here, despite the earlier "70:30" comment.
        return dataset[:self._MAX_TRAIN_EXAMPLES]

    def train_data(self):
        """Train the classifier, pickle it to disk and return it.

        Returns:
            The classifier produced by ``NaiveBayesClassifier.train``; it is
            also written to ``my_classifier.pickle`` in the working
            directory.
        """
        classifier = NaiveBayesClassifier.train(self.preprocess_data())
        # Context manager guarantees the pickle is flushed and closed.
        with open('my_classifier.pickle', 'wb') as f:
            pickle.dump(classifier, f)
        return classifier