-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
136 lines (109 loc) · 4.27 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
'''Britney Tweets Searcher '''
# import modules
import base64
import io
from flask import Flask, request, render_template
from pymongo import MongoClient
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# Create the Flask application object; the @app.route decorators below
# register their view functions on it.
app = Flask(__name__)
def get_db():
    """Connect to MongoDB and return a handle to the ``britney_db`` database.

    A new ``MongoClient`` is created on every call. Note that the returned
    object is the database, not the client.

    NOTE(review): host and credentials are hard-coded here; consider moving
    them to configuration.
    """
    connection_settings = {
        "host": 'brit_mongodb',
        "port": 27017,
        "username": 'root',
        "password": 'pass',
        "authSource": "admin",
    }
    return MongoClient(**connection_settings)["britney_db"]
@app.route('/', methods=["GET"])
def brit_index():
    """Serve the home page containing the search form."""
    home_page = render_template('index.html')
    return home_page
def _encode_figure_png(fig):
    """Render *fig* to an in-memory PNG and return it as a base64 string."""
    buffer = io.BytesIO()
    fig.savefig(buffer, format='png')
    return base64.b64encode(buffer.getvalue()).decode()


def _hourly_count_chart(df):
    """Return a base64 PNG bar chart of tweet counts per hour of day."""
    fig, axis = plt.subplots(figsize=(12, 4))
    try:
        df.groupby(df['created_at'].dt.hour).size().plot(
            kind='bar', rot=0, ax=axis
        )
        axis.set_xlabel("Time of day (hours)", fontsize=12)
        axis.set_ylabel("Tweet Count", fontsize=12)
        axis.set_title('Tweet count by time of day',
                       loc='left', y=0.85, x=0.02, fontsize='medium')
        return _encode_figure_png(fig)
    finally:
        # pyplot keeps a reference to every figure until it is closed;
        # without this each request would leave a figure behind.
        plt.close(fig)


def _daily_count_chart(df):
    """Return a base64 PNG line chart of tweet counts per calendar day."""
    fig, axis = plt.subplots(figsize=(12, 4))
    try:
        df.groupby(df['created_at'].dt.date).size().plot(
            kind='line', rot=0, ax=axis
        )
        axis.set_xlabel("Days", fontsize='medium')
        axis.set_ylabel("Tweet Count", fontsize='medium')
        axis.set_title('Tweet count by day',
                       loc='left', y=0.85, x=0.02, fontsize='medium')
        axis.xaxis.set_major_formatter(
            mdates.ConciseDateFormatter(axis.xaxis.get_major_locator())
        )
        # set font and rotation for date tick labels
        fig.autofmt_xdate()
        return _encode_figure_png(fig)
    finally:
        plt.close(fig)


@app.route('/result', methods=["POST"])
def brit_search():
    """Search tweets for the submitted term and render the results page.

    Reads the ``bsearch`` form field, queries MongoDB through
    ``get_britney_df`` and passes the resulting DataFrame plus two
    base64-encoded PNG charts to ``result.html``.

    The results are meant to answer the following:
    - How many tweets were posted containing the term on each day?
    - How many unique users posted a tweet containing the term?
    - How many likes did tweets containing the term get, on average?
    - Where (in terms of place IDs) did the tweets come from?
    - What times of day were the tweets posted at?
    - Which user posted the most tweets containing the term?

    Returns:
        The rendered ``result.html`` page, or ``index.html`` with a
        ``message`` when the search term is missing/blank or the query
        produced no rows.
    """
    # The field may be absent (None) or contain only whitespace; treat both
    # the same as an empty search box.
    search_value = (request.form.get('bsearch') or '').strip()
    if not search_value:
        return render_template('index.html', message='Search Value required')

    # get a dataframe from mongo query of search term
    df = get_britney_df(search_value)
    # None signals a failed query; an empty frame means no tweet matched and
    # has no columns to chart, so both end up on the same message.
    if df is None or df.empty:
        return render_template('index.html', message='No data returned')

    # clean some data
    # convert created_at to a timezone-aware datetime; unparseable values
    # become NaT instead of raising
    df['created_at'] = pd.to_datetime(df['created_at'],
                                      infer_datetime_format=True,
                                      errors='coerce',
                                      utc=True)
    df['like_count'] = pd.to_numeric(df['like_count'], errors='coerce')

    hour_plot_url = _hourly_count_chart(df)
    day_plot_url = _daily_count_chart(df)

    # TODO: use twarc2 and convert place ID to city names
    # pass the df and plots to results template
    return render_template('result.html',
                           brit_df=df,
                           hourplot=hour_plot_url,
                           dayplot=day_plot_url)
def get_britney_df(search_val: str) -> "pd.DataFrame | None":
    """Run a MongoDB text search on the tweets collection.

    Args:
        search_val: Term passed to the ``$text``/``$search`` operator.

    Returns:
        A DataFrame with one row per matching tweet (empty when nothing
        matched), or ``None`` if the query failed for any reason.
    """
    db = None
    try:
        db = get_db()
        # db.tweets.create_index( { "text": "text" } )
        # create text search query
        text_query = {"$text": {"$search": str(search_val)}}
        # only fetch the fields the results page uses
        wanted_fields = {
            "created_at": 1,
            "author_handle": 1,
            "author_id": 1,
            "like_count": 1,
            "place_id": 1,
            "text": 1,
        }
        matches = db.tweets.find(text_query, wanted_fields)
        return pd.DataFrame(list(matches))
    except Exception:
        # Best-effort boundary: the caller treats None as "no data", but the
        # failure is logged rather than silently discarded.
        app.logger.exception("Tweet search failed for %r", search_val)
        return None
    finally:
        # get_db() returns a Database, not a MongoClient, so the connection
        # has to be closed through the database's owning client.
        if db is not None:
            db.client.close()
# Start Flask's built-in server only when this file is executed directly.
if __name__ == '__main__':
    app.run(port=5000, host="0.0.0.0")