# Author: Latish Khubnani
# Date: Fri Feb 10 2017

from ast import literal_eval
from datetime import datetime as dt
from plotly import tools
import numpy as np
import pandas as pd
import pymongo
import pickle
import plotly as py
import plotly.graph_objs as go

mongo_db = {'host': 'localhost', 'port': 27017, 'db': 'Homer', 'collection': 'DataEngineerSampleData'}
client = pymongo.MongoClient(mongo_db['host'], mongo_db['port'])
db = client[mongo_db['db']]
collection = db[mongo_db['collection']]
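
# The aggregations below group on '_n' and the per-user queries filter on '_p'; on a
# large collection these benefit from indexes. A minimal sketch, assuming default
# mongod settings (create_index is a no-op if the index already exists):
# collection.create_index('_n')
# collection.create_index('_p')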


def processData(collection_name):
    """
    Reads the large event-log text file line by line and stores each record in the
    database after adding derived date fields.
    :param collection_name: string name of the target collection
    """
    if not isinstance(collection_name, str):
        print("Invalid name, using the string conversion of the passed parameter")
        collection_name = str(collection_name)

    global collection
    collection = db[collection_name]
    days = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: "Saturday", 6: "Sunday"}
    var = 0
    with open('DataEngineerSampleData.txt', encoding='utf-8') as f:
        for line in f:
            record = literal_eval(line.rstrip())
            dateTime = dt.fromtimestamp(record['_t'])
            record['Year'] = dateTime.year
            record['Month'] = dateTime.month
            record['Day'] = dateTime.day
            record['Week'] = dateTime.isocalendar()[1]
            record['DayOfWeek'] = days[dateTime.weekday()]
            record['Hour'] = dateTime.hour
            record['Minutes'] = dateTime.minute
            collection.insert_one(record)  # insert() is deprecated in pymongo 3.x
            del record

            # Uncomment to test the output with the first 100,000 records:
            # var += 1
            # if var > 100000:
            #     break
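
# Example usage sketch (assumes DataEngineerSampleData.txt is in the working
# directory, one Python-literal dict per line):
# processData('DataEngineerSampleData')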


def most_common_events(number_of_events=4):
    """
    Finds the most frequently occurring events and prints the results as a data frame.
    :param number_of_events: number of top events to return
    :return: Graph object trace
    """
    pipeline = [
        {"$group": {"_id": "$_n", "count": {"$sum": 1}}},
        {"$sort": {'count': -1}}
    ]
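
    # On MongoDB 3.4+, these two stages can be collapsed into a single
    # {"$sortByCount": "$_n"} stage; the explicit $group/$sort form is kept
    # for compatibility with older servers.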

    event_counts = list(collection.aggregate(pipeline))
    # The pipeline already sorts by count, so just keep the top N results.
    if len(event_counts) > number_of_events:
        event_counts = event_counts[0:number_of_events]

    with open('A1.pkl', 'wb') as output:
        pickle.dump(event_counts, output)
    event_counts = pd.DataFrame(event_counts)
    event_counts.rename(columns={"_id": "Event Type"}, inplace=True)
    event_counts = event_counts[event_counts["Event Type"].notnull()]
    print("\n", event_counts)
    trace1 = go.Bar(x=event_counts["Event Type"], y=event_counts['count'])
    # py.offline.plot([trace1])
    return trace1


def most_read_title(top_x=5):
    """
    Queries the mongo database for the most read titles and prints the results as a data frame.
    :param top_x: number of top titles to return; defaults to the top 5
    :return: Graph object trace
    """
    most_read_title_pipeline = [
        {"$match": {"_n": "open"}},
        {"$group": {"_id": "$manuscripttitle", "count": {"$sum": 1}}},
        {"$sort": {'count': -1}}
    ]

    most_read_title_results = list(collection.aggregate(most_read_title_pipeline))
    # The pipeline already sorts by count, so just keep the top results.
    if len(most_read_title_results) > top_x:
        most_read_title_results = most_read_title_results[0:top_x]

    print("\n Most read title results:\n", most_read_title_results)

    with open('A2.pkl', 'wb') as output:
        pickle.dump(most_read_title_results, output)

    most_read_title_results = pd.DataFrame(most_read_title_results)
    most_read_title_results.rename(columns={"_id": "Lesson Title"}, inplace=True)
    trace = go.Bar(x=most_read_title_results["Lesson Title"], y=most_read_title_results["count"])

    print("\n", most_read_title_results)

    return trace


def get_user_stats(user_id="V0DHBMUYQI"):
    """
    Computes how much time a user spends reading, averaged by day, week and month.
    :param user_id: user identification alphanumeric string
    """
    # To restrict the query itself: "_n": {"$in": ["open", "complete", "incomplete"]}
    user_activity = list(collection.find({"_p": user_id},
                                         {"_n": 1, "_p": 1, "_t": 1, "manuscriptid": 1, "Hour": 1, "Minutes": 1,
                                          "Day": 1, "Week": 1, "Month": 1, "Year": 1}))
    df = pd.DataFrame(user_activity)
    df = df.drop_duplicates(subset=["_n", "_t"])

    # Approach 1
    # Calculate the time between the current event and the next one: an 'open' event followed by any
    # other event (such as opening the menu) records how long the lesson stayed open, i.e. how long it
    # was read. The key is that any different event following an 'open' means the user has stopped
    # reading, so we don't need the corresponding 'complete' or 'incomplete' event to calculate the
    # time spent. We then group by day, week and month and take the mean. This can undercount when no
    # further event is recorded because the user simply closes the app.

    # Approach 2
    # Per the problem statement, we pair 'open' with 'complete'/'incomplete' to measure the total time
    # spent completing a lesson. For that we group by manuscript id, remove duplicates, and subtract
    # each record's time from the next record's within the group.
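
    # Illustrative sketch of the per-group delta computed below: for timestamps
    # [t0, t1, t2] within one manuscriptid, x - x.shift() yields [NaT, t1 - t0, t2 - t1],
    # so each row holds the time elapsed since the previous event for that manuscript.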

    df = df[df["_n"].isin(["open", "complete", "incomplete"])]
    df["timeSpent"] = [dt.fromtimestamp(x) for x in df["_t"]]
    grouped_df = df.groupby('manuscriptid')["timeSpent"].apply(lambda x: x - x.shift())
    df["timeSpent"] = grouped_df.fillna(pd.Timedelta(0))
    # Convert the timedeltas to minutes, matching the column label below.
    df["timeSpent"] = [np.abs(x.total_seconds()) / 60 for x in df["timeSpent"]]

    df.rename(columns={'timeSpent': "Minutes Spent"}, inplace=True)
    average_day_time_spent = pd.pivot_table(df, index=["Day", "Month", "Year"], values=["Minutes Spent"],
                                            aggfunc=[np.mean])
    average_month_time_spent = pd.pivot_table(df, index=["Month", "Year"], values=["Minutes Spent"],
                                              aggfunc=[np.mean])
    average_week_timespent = pd.pivot_table(df, index=["Week", "Year"], values=["Minutes Spent"],
                                            aggfunc=[np.mean])

    print("Average time spent by day", average_day_time_spent, "\n\nAverage time spent by week",
          average_week_timespent, "\n\nAverage time spent by month", average_month_time_spent)

    with open('A3.pkl', 'wb') as output:
        pickle.dump(df, output)
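
    # The pickled frame can be reloaded later for inspection, e.g.:
    # with open('A3.pkl', 'rb') as f:
    #     df = pickle.load(f)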


def main():
    events = most_common_events(4)
    titles = most_read_title(5)

    fig = tools.make_subplots(rows=2, cols=1, subplot_titles=('1. Most Occurring Events', '2. Most Read Titles'))
    fig.append_trace(events, 1, 1)
    fig.append_trace(titles, 2, 1)
    fig['layout'].update(title='Challenge Answers', showlegend=False)
    py.offline.plot(fig)

    print("For User IHL8FBBKTB")
    get_user_stats("IHL8FBBKTB")


if __name__ == '__main__':
    main()


# Most occurring users:
# 0 IHL8FBBKTB 52861
# 1 V0DHBMUYQI 51425
# 2 JBMU3E6QRD 34759
# 3 NPL04PA1NT 31596
# 4 FZQM7F3WQU 30759
# 5 FZGCAHO857 28796
# 6 9PEE7EACG9 26288
# 7 Z6JNHVGMFP 25354
# 8 2PEP2P4NGD 22478
# 9 084RJU8BZK 19157