
Commit cd86fb2

Author: Latish Khubnani
Completing the challenge

0 parents  commit cd86fb2

File tree

5 files changed: +259 -0 lines

A1.pkl

186 Bytes
Binary file not shown.

A2.pkl

211 Bytes
Binary file not shown.

A3.pkl

869 KB
Binary file not shown.
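
The pickles are the script's saved query results: A1.pkl and A2.pkl hold lists of aggregation result dicts, and A3.pkl holds a pandas DataFrame (see the pickle.dump calls in HomerChallenge below). A minimal sketch for inspecting them after a run, assuming the files sit in the working directory (variable names here are illustrative):

import pickle

import pandas as pd

with open('A1.pkl', 'rb') as f:
    event_counts = pickle.load(f)   # list of {'_id': ..., 'count': ...} dicts
with open('A2.pkl', 'rb') as f:
    title_counts = pickle.load(f)   # list of {'_id': ..., 'count': ...} dicts
with open('A3.pkl', 'rb') as f:
    user_df = pickle.load(f)        # DataFrame of one user's events and time spent

print(pd.DataFrame(event_counts))
print(pd.DataFrame(title_counts))
print(user_df.head())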

HomerChallenge

Lines changed: 196 additions & 0 deletions
# Author: Latish Khubnani
# Date: Fri Feb 10 2017

from ast import literal_eval
from datetime import datetime as dt
from plotly import tools
import numpy as np
import pandas as pd
import pymongo
import pickle
import plotly as py
import plotly.graph_objs as go

mongo_db = { 'host': 'localhost', 'port': 27017, 'db': 'Homer', 'collection': 'DataEngineerSampleData' }
client = pymongo.MongoClient(mongo_db['host'], mongo_db['port'])
db = client[mongo_db['db']]
collection = db[mongo_db['collection']]

def processData(collection_name):
    """
    Reads the large text file line by line, adds date fields to each record,
    and stores the records in the database.
    :param collection_name: string
    """
    if not isinstance(collection_name, str):
        print("Invalid name, using the string conversion of the passed parameter")
        collection_name = str(collection_name)

    global collection, db, mongo_db
    collection = db[collection_name]  # fixed: db[mongo_db[collection_name]] raises a KeyError for arbitrary names
    days = { 0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: "Saturday", 6: "Sunday" }
    var = 0  # counter used only by the optional record cap below
    with open('DataEngineerSampleData.txt', encoding='utf-8') as f:
        for line in f:
            record = literal_eval(line.rstrip())
            dateTime = dt.fromtimestamp(record['_t'])
            record['Year'] = dateTime.date().year
            record['Month'] = dateTime.date().month
            record['Day'] = dateTime.date().day
            record['Week'] = dateTime.date().isocalendar()[1]
            record['DayOfWeek'] = days[dateTime.date().weekday()]
            record['Hour'] = dateTime.time().hour
            record['Minutes'] = dateTime.time().minute
            collection.insert_one(record)  # insert() is deprecated in pymongo 3
            del record

            # To test the output with only the first 100,000 records:
            # var += 1
            # if var > 100000:
            #     break
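
# For reference, each input line is a Python dict literal (hence literal_eval above).
# A hypothetical line (values invented; the keys are the ones the queries below use):
#
#   {'_n': 'open', '_p': 'V0DHBMUYQI', '_t': 1486700000,
#    'manuscriptid': 'm1', 'manuscripttitle': 'Sample Lesson'}
#
# literal_eval turns it into a dict, and the loop above adds Year, Month, Day,
# Week, DayOfWeek, Hour, and Minutes derived from dt.fromtimestamp(record['_t']).
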
def most_common_events(number_of_events=4):
    """
    Finds the most frequently occurring events and prints the result as a data frame.
    :param number_of_events: number of top events to return
    :return: graph object trace
    """
    pipeline = [
        { "$group": { "_id": "$_n", "count": { "$sum": 1 } } },
        { "$sort": { 'count': -1 } }
    ]

    event_counts = list(collection.aggregate(pipeline))
    event_counts.sort(key=lambda x: x['count'], reverse=True)  # defensive; the pipeline already sorts
    if len(event_counts) > number_of_events:
        event_counts = event_counts[0:number_of_events]

    with open('A1.pkl', 'wb') as output:
        pickle.dump(event_counts, output)
    event_counts = pd.DataFrame(event_counts)
    event_counts.rename(columns={ "_id": "Event Type" }, inplace=True)
    event_counts = event_counts[event_counts["Event Type"].notnull()]
    print("\n", event_counts)
    trace1 = go.Bar(x=event_counts["Event Type"], y=event_counts['count'])
    # py.offline.plot([trace1])
    return trace1
def most_read_title(top_x=5):
    """
    Queries the Mongo database for the most read titles and prints the result as a data frame.
    :param top_x: number of top titles to return (default 5)
    :return: graph object trace
    """
    most_read_title_pipeline = [
        { "$match": { "_n": "open" } },
        { "$group": { "_id": "$manuscripttitle", "count": { "$sum": 1 } } },
        { "$sort": { 'count': -1 } }
    ]

    most_read_title_results = list(collection.aggregate(most_read_title_pipeline))
    most_read_title_results.sort(key=lambda x: x['count'], reverse=True)  # defensive; the pipeline already sorts
    if len(most_read_title_results) > top_x:
        most_read_title_results = most_read_title_results[0:top_x]

    print("\n Most read title results:\n", most_read_title_results)

    with open('A2.pkl', 'wb') as output:
        pickle.dump(most_read_title_results, output)

    most_read_title_results = pd.DataFrame(most_read_title_results)
    most_read_title_results.rename(columns={ "_id": "Lesson Title" }, inplace=True)
    # py.offline.plot([])
    trace = go.Bar(x=most_read_title_results["Lesson Title"], y=most_read_title_results["count"])

    print("\n", most_read_title_results)

    return trace
def get_user_stats(user_id="V0DHBMUYQI"):
    """
    Computes how much time the given user spent reading, averaged by day, week, and month.
    :param user_id: user identification, an alphanumeric string
    """
    # (alternative filter) "_n": { "$in": ["open", "complete", "incomplete"] }
    user_activity = list(collection.find({ "_p": user_id },
                                         { "_n": 1, "_p": 1, "_t": 1, "manuscriptid": 1, "Hour": 1, "Minutes": 1,
                                           "Day": 1, "Week": 1, "Month": 1, "Year": 1 }))
    df = pd.DataFrame(user_activity)
    df = df.drop_duplicates(subset=["_n", "_t"])

    # Approach 1
    # Compute the time between each event and the next one. An 'open' event followed by, say,
    # opening the menu records how long the lesson stayed open, i.e. how long it was read. The key
    # observation is that any event following an 'open' means the user has stopped reading, so we
    # don't need to find the corresponding 'complete' or 'incomplete' event to calculate the time
    # spent. We then group by day, week, and month and take the mean. This may be a problem when
    # no further event is recorded and the user simply closes the app.

    # Approach 2
    # Per the problem statement, we need to pair 'open' with 'complete'/'incomplete' to measure the
    # total time spent completing a lesson. For that we group by manuscript id, remove duplicates,
    # and subtract each record's time from the next record's within the group.
    # (A toy illustration of this shift pattern follows the file listing below.)

    df = df[df["_n"].isin(["open", "complete", "incomplete"])]
    df["timeSpent"] = [dt.fromtimestamp(x) for x in df["_t"]]
    grouped_df = df.groupby('manuscriptid')["timeSpent"].apply(lambda x: x - x.shift())
    df["timeSpent"] = grouped_df.fillna(pd.Timedelta(0))  # fixed: fillna(0) is not valid for timedeltas
    df["timeSpent"] = [np.abs(x.total_seconds()) / 60 for x in df["timeSpent"]]  # fixed: /3600 gave hours, not minutes

    # print(grouped_df)
    # print(df)

    df.rename(columns={ 'timeSpent': "Minutes Spent" }, inplace=True)
    average_day_time_spent = pd.pivot_table(df, index=["Day", "Month", "Year"], values=["Minutes Spent"],
                                            aggfunc=[np.mean])

    average_month_time_spent = pd.pivot_table(df, index=["Month", "Year"], values=["Minutes Spent"],
                                              aggfunc=[np.mean])
    average_week_timespent = pd.pivot_table(df, index=["Week", "Year"], values=["Minutes Spent"],
                                            aggfunc=[np.mean])

    print("Average time spent by day", average_day_time_spent, "\n\nAverage time spent by week",
          average_week_timespent, "\n\nAverage time spent by month", average_month_time_spent)

    with open('A3.pkl', 'wb') as output:
        pickle.dump(df, output)

    # all_plots = go.bar(average_day_time_spent)

def main():
    Events = most_common_events(4)
    Titles = most_read_title(5)

    fig = tools.make_subplots(rows=2, cols=1, subplot_titles=('1. Most Occurring Events', '2. Most Read Titles'))
    fig.append_trace(Events, 1, 1)
    fig.append_trace(Titles, 2, 1)
    fig['layout'].update(title='Challenge Answers', showlegend=False)
    py.offline.plot(fig)

    print("For User IHL8FBBKTB")
    get_user_stats("IHL8FBBKTB")


if __name__ == '__main__':
    main()


# most frequently occurring users
# 0  IHL8FBBKTB  52861
# 1  V0DHBMUYQI  51425
# 2  JBMU3E6QRD  34759
# 3  NPL04PA1NT  31596
# 4  FZQM7F3WQU  30759
# 5  FZGCAHO857  28796
# 6  9PEE7EACG9  26288
# 7  Z6JNHVGMFP  25354
# 8  2PEP2P4NGD  22478
# 9  084RJU8BZK  19157
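
The Approach comments in get_user_stats describe a shift-and-subtract pattern within each manuscript group. A toy illustration on invented timestamps (not challenge data; .diff() is shorthand for the x - x.shift() used in the script):

from datetime import datetime as dt

import pandas as pd

toy = pd.DataFrame({ 'manuscriptid': ['m1', 'm1', 'm2', 'm2'],
                     '_t': [1486700000, 1486700600, 1486701000, 1486702800] })
toy['timeSpent'] = [dt.fromtimestamp(x) for x in toy['_t']]
# Within each manuscript group, subtract each event's time from the next event's.
deltas = toy.groupby('manuscriptid')['timeSpent'].diff()
toy['Minutes Spent'] = deltas.fillna(pd.Timedelta(0)).dt.total_seconds() / 60
print(toy)  # the m1 pair spans 10 minutes, the m2 pair spans 30 minutes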

temp-plot.html

Lines changed: 63 additions & 0 deletions
Large diffs are not rendered by default.
