-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparse_text.py
115 lines (101 loc) · 3.94 KB
/
parse_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import re
import twitter_text
def parse_tweet(status, pos_emotes=[':\)', ':-\)', ': \)', ':D', \
'=\)', '\(:', '\(-:', '\( :', '\(=', ':\]', ':-\]', '=\]', \
'\[:', '\[-:', '\[=', ':o\)', ':3', ':c\)', '8\)', ':\}', ':^\)'], \
neg_emotes=[':\(', '>:\[', ':-c', ':c', ':-<', ':<', ':\{', \
'>.>', '<.<', '>.<', '>:\\\\', '>:/', ':-/', ':/', ':\\\\', '=/', \
'=\\\\', ':S', ':-\(',': \(', '\):', '\)-:', '\) :']):
"""Removes urls, usernames, and repeated letters. Looks for emoticons
and and finds tweet sentiment. Returns no message if it is a retweet,
otherwise, returns the parsed tweet, along with a dictionary of
urls, usernames, and hashtags, with a sentiment code. Positive
sentiment = 1, negative sentiment = 0, no sentiment = None."""
if 'RT' in status:
return None
ttext = twitter_text.TwitterText(status)
extr = ttext.extractor
urls = extr.extract_urls_with_indices()
hashtags = extr.extract_hashtags_with_indices()
usernames = extr.extract_mentioned_screen_names_with_indices()
(new_text, tweet_dic) = \
replace_with_words(status, urls + hashtags + usernames)
(new_text, positive, negative) = \
remove_emoticons(new_text, pos_emotes, neg_emotes)
if positive > 0 and negative > 0:
return None
elif positive > 0:
sentiment = 1
elif negative > 0:
sentiment = -1
else:
sentiment = 0
new_text = remove_multiple_repeats(new_text)
return (new_text, tweet_dic, sentiment)
def replace_with_words(text, dic_list):
"""Function that replaces usernames with 'USER', hashtags with
'HASHTAG', and urls with 'URL'."""
startend = []
tweet_dic = {'URL':[], 'HASHTAG':[], 'USER':[]}
for dic in dic_list:
if 'url' in dic:
value = dic['url']
replace = 'URL'
elif 'hashtag' in dic:
value = dic['hashtag']
replace = 'HASHTAG'
elif 'screen_name' in dic:
value = dic['screen_name']
replace = 'USER'
(start, end) = dic['indices']
startend.append((start, end, replace, value))
startend.sort()
newstr = ''
next_start = 0
for i in xrange(len(startend)):
tweet_dic[startend[i][2]].append(startend[i][3])
newstr += text[next_start:startend[i][0]] + startend[i][2]
next_start = startend[i][1]
newstr += text[next_start:]
return (newstr, tweet_dic)
def remove_multiple_repeats(text):
repeat_list = []
num_repeats = 0
for i in xrange(1, len(text)):
if text[i] == text[i-1]:
num_repeats += 1
if num_repeats > 1:
repeat_list.append(i)
else:
num_repeats = 0
indices = [text[i] for i in xrange(len(text)) if i not in repeat_list]
return ''.join(indices)
def remove_emoticons(text, pos_emotes, neg_emotes):
words = text.split()
delete = []
positive = 0
for emote in pos_emotes:
format_str = r'( ?%s | %s ?)' % (emote, emote)
(text, count) = re.subn(format_str, ' ', text)
if count > 0:
positive += 1
negative = 0
for emote in neg_emotes:
format_str = r'( ?%s | %s ?)' % (emote, emote)
(text, count) = re.subn(format_str, ' ', text)
if count > 0:
negative += 1
return (text, positive, negative)
def reformat_word(word):
"""Returns a string which strips away non alpha-numeric characters and
turns everything to lowercase. For example, the function would
turn "Hello!" into "hello"."""
word = word.lower()
return ''.join([s for s in word if s.isalnum()])
if __name__ == '__main__':
a = 'Rhema #Vaithianathan - Are Bad Managers To Blame For NZ\'s Poor Economic Performance? yhoo.it/ypM4yy : ) #via @scoopnz'
print parse_tweet(a)
b = 'hello :\ :/ my name is =/ =\ joooooohnnnmyname'
print parse_tweet(b)
a = '!!hello!?'
print reformat_word(a)