# lambda_function.py
# AWS Lambda handler that searches recent Fluminense tweets, builds a word
# cloud from the cleaned text, and posts the image back to Twitter.
import os
import re
import io
import string
from datetime import datetime

# Lambda's filesystem is read-only except /tmp, so point matplotlib's config
# cache there and force the headless Agg backend before pyplot is imported.
os.environ.setdefault("MPLCONFIGDIR", "/tmp")
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

import nltk
import pandas as pd
import pytz
import tweepy
from unidecode import unidecode
from wordcloud import WordCloud
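# These third-party packages (tweepy, wordcloud, nltk, pandas, matplotlib,
# pytz, unidecode) are not in the Lambda runtime; they must ship with the
# deployment package or a Lambda layer.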
# Download the NLTK stopword corpus into /tmp on cold start; nltk skips the
# download when the corpus is already there from a previous warm invocation.
nltk.data.path.append("/tmp")
nltk.download('stopwords', download_dir="/tmp")
stopwords = nltk.corpus.stopwords.words('portuguese')
# Domain-specific noise on top of the standard Portuguese stopwords: internet
# slang, laughter, and club-related words that would dominate the cloud.
newStopWords = [
    'né', 'Se', 'q', 'vc', 'ter', 'ne', 'da', 'to', 'tô', 'https', 'tá',
    'dar', 'te', 'eu', 'HTTPS', 'pra', 'tbm', 'tb', 'tt', 'ja', 'nao', 'et',
    'fluminense', 'fluzao', 'vamos', 'dia', 'flu', 'sentinense', 'ffc', 'jogo',
    'cara', 'ai', 'desse', 'quis', 'voce', 'vai', 'ta', 'ela', 'sobre', 'cada',
    'ah', 'mas', 'mais', 'rs', 'contra', 'agr', 'mt', 'dnv', 'cr', 'pro',
    'dela', 'vem', 'outra', 'porque', 'por que', 'por quê', 'porquê', 'bem',
    'rt', 'todo', 'tao', 'acho', 'sao', 'voces', 'pq', 'co', 't', 'n', 'desde',
    'so', 'mim', 'la', 'quer', 'fez', 'agora', 'aqui', 'vcs', 'gente', 'deu',
    'ate', 'oq', 'ser', 'kkk', 'kk', 'kkkk', 'kkkkk', 'kkkkkk', 'kkkkkkkkk',
    'kkkkkkk', 'kkkkkkkk', 'fazendo', 'estao', 'hoje', 'fazer', 'nessa',
    'ainda', 'diz', 'pois', 'falando', 'disse', 'dessa', 'p', 'x',
]
stopwords.extend(newStopWords)
def remove_hashtag_and_mention(text):
    """Drop whole @mentions and #hashtags; replace other punctuation with spaces."""
    entity_prefixes = ['@', '#']
    for separator in string.punctuation:
        if separator not in entity_prefixes:
            text = text.replace(separator, ' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word and word[0] not in entity_prefixes:
            words.append(word)
    return ' '.join(words)
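# Example: remove_hashtag_and_mention('veja o #Flu hoje, @torcedor!')
# returns 'veja o hoje'.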
def lambda_handler(event, context):
    # Authentication. The values below are placeholders; in a real deployment,
    # load the credentials from environment variables or AWS Secrets Manager
    # instead of hardcoding them.
    print("Get credentials")
    consumer_key = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
    consumer_secret = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
    access_token = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
    access_token_secret = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
    print("Authenticate")
    auth = tweepy.OAuth1UserHandler(
        consumer_key, consumer_secret, access_token, access_token_secret
    )
    api = tweepy.API(auth, wait_on_rate_limit=True)
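    # wait_on_rate_limit makes tweepy sleep until the rate-limit window resets,
    # which can exceed the Lambda timeout; keep n_tweets modest.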
    # Search query: club hashtags and common nicknames; Portuguese tweets only.
    keyword = '#Fluminense OR #Flu OR Fluzão OR FFC OR Fluminense'
    n_tweets = 200
    tweet_list = []
    # Current date and time in São Paulo, used to timestamp the tweet.
    tz = pytz.timezone('America/Sao_Paulo')
    ct = datetime.now(tz=tz)
    dt_string = ct.strftime("%d/%m/%Y %H:%M:%S")
    print("Getting tweets")
    # Cursor paginates the search; tweet_mode='extended' exposes the full,
    # untruncated text as tweet.full_text. unidecode strips accents up front.
    for tweet in tweepy.Cursor(api.search_tweets, q=keyword, lang='pt',
                               tweet_mode='extended').items(n_tweets):
        tweet_list.append(unidecode(tweet.full_text))
    # Cleaning pipeline: dedupe, lowercase, then strip retweet markers, links,
    # hashtags/mentions, stopwords, and leftover punctuation.
    tw_list = pd.DataFrame(tweet_list)
    tw_list.drop_duplicates(inplace=True)
    tw_list['original'] = tw_list[0]
    tw_list['text'] = tw_list[0]
    # Lowercase
    tw_list['text'] = tw_list.text.str.lower()
    # Remove "rt @user:" retweet prefixes
    remove_rt = lambda x: re.sub(r'rt @\w+: ', ' ', x)
    # Remove links
    links = lambda x: re.sub(r'http\S+', ' ', x)
    tw_list['text'] = tw_list.text.map(remove_rt)
    tw_list['text'] = tw_list.text.map(links)
    # Remove hashtags and mentions
    tw_list['text'] = tw_list['text'].apply(remove_hashtag_and_mention)
    # Remove stopwords
    tw_list['text'] = tw_list['text'].apply(
        lambda x: ' '.join(w for w in x.split() if w not in stopwords))
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    tw_list['text'] = tw_list['text'].apply(
        lambda x: ' '.join(w.translate(table) for w in x.split()))
    # Generate the word cloud from the concatenated cleaned text.
    print("Generating word cloud")
    wc = WordCloud(background_color='white',
                   collocations=False,
                   width=1600,
                   height=800,
                   colormap='tab10',
                   contour_width=3,
                   contour_color='black',
                   stopwords=stopwords).generate(tw_list['text'].str.cat(sep=' '))
    # Render to an in-memory PNG; nothing is written to disk.
    plt.figure(figsize=(20, 10), facecolor='k')
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)  # rewind so the upload reads the image from the start
    # The .png extension matters: tweepy infers the media type from the filename.
    response = api.media_upload(filename="wordcloud.png", file=buf)
    status = 'Fluminense: ' + dt_string + ' #FLU #Sentinense'
    api.update_status(status=status, media_ids=[response.media_id_string])
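    # Assumed addition: the original handler returned nothing; a small payload
    # makes the invocation result visible in the Lambda console and to callers.
    return {"statusCode": 200, "body": "Word cloud tweeted at " + dt_string}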