diff --git a/doc/filtering_observed_arrivals.ipynb b/doc/filtering_observed_arrivals.ipynb index c265acd..f14f93e 100644 --- a/doc/filtering_observed_arrivals.ipynb +++ b/doc/filtering_observed_arrivals.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -393,13 +393,15 @@ "|2 | 62 |\n", "| 4 | 6 |\n", "\n", - "So the " + "So the `train_id` isn't quite unique per trip, but something weird is happening to have more train ids than actual trains..." ] }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [ { "ename": "NameError", @@ -423,20 +425,6 @@ "one_train = pandasql.read_sql(sql, con)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": 5, @@ -6803,6 +6791,13 @@ " & ((train_136['train_message'] != 'Delayed') | (train_136['timint'] < 1.0 ))]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Minute-Resolution Data" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -7616,7 +7611,1486 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The top \"trips\" are from Bloor-Spadina to Yonge via St. George and vice-versa, but these are the stations on line 2..." + "The top \"trips\" are from Bloor-Spadina to Yonge via St. George and vice-versa, but these are the stations on line 2... I think we are getting data for the wrong line because the line id is actually assigned by the data pulling script. So we need to filter the data by line based on `subwayline` field in the `ntas_data` table." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Re-examining Line 1 by Filtering on Stations\n", + "Re-doing this entire processing by using the `subwayline` from the `ntas_data` table instead" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Recreate a day of data\n", + "sql = '''DROP MATERIALIZED VIEW IF EXISTS test_day CASCADE; \n", + "CREATE MATERIALIZED VIEW test_day AS \n", + "SELECT requestid, stationid, line_id, create_date, request_date, station_char, subwayline, system_message_type, \n", + " timint, traindirection, trainid, train_message\n", + "FROM requests_serverless\n", + "INNER JOIN ntas_data_serverless USING (requestid)\n", + "INNER JOIN stations ON stationid = station_id\n", + "WHERE request_date >= '2019-07-17'::DATE + interval '5 hours' \n", + "AND request_date < '2019-07-17'::DATE + interval '29 hours' \n", + "''' \n", + "with con:\n", + " with con.cursor() as cur:\n", + " cur.execute(sql)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Re-checking the number of unique trains" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subwaylineNumber of trains in a day
0BD48
1SHEP4
2YUS169
\n", + "
" + ], + "text/plain": [ + " subwayline Number of trains in a day\n", + "0 BD 48\n", + "1 SHEP 4\n", + "2 YUS 169" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql = ''' SELECT subwayline, COUNT(DISTINCT trainid) AS \"Number of trains in a day\"\n", + " FROM test_day\n", + " GROUP BY subwayline'''\n", + "pandasql.read_sql(sql, con)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Double-checking stations and lines are correctly mapped" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subwaylinestation_charstationid
0BDBAT146
1BDBAT246
2BDBAU149
3BDBAU249
4YUSBLO150
5YUSBLO122
6YUSBLO222
7YUSBLO250
8BDBRD153
9BDBRD253
10BDBSP19
11BDBSP147
12BDBSP29
13BDBSP247
14SHEPBSS166
15SHEPBSS266
16SHEPBYV165
17SHEPBYV265
18BDCFK152
19BDCFK252
20BDCHE154
21BDCHE254
22BDCHR145
23BDCHR245
24YUSCOL120
25YUSCOL220
26BDCOX158
27BDCOX258
28YUSCVL16
29YUSCVL26
............
134YUSSUM124
135YUSSUM224
136YUSUNI116
137YUSUNI216
138YUSVMC180
139YUSVMC280
140BDVPK161
141BDVPK261
142BDWAR162
143BDWAR262
144BDWDB159
145BDWDB259
146YUSWEL121
147YUSWEL221
148YUSWIL12
149YUSWIL22
150SHEPYIE130
151SHEPYIE164
152SHEPYIE264
153SHEPYIE230
154YUSYKD13
155YUSYKD23
156YUSYKM129
157YUSYKM229
158BDYNG150
159BDYNG122
160BDYNG250
161BDYNG222
162YUSYUN177
163YUSYUN277
\n", + "

164 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " subwayline station_char stationid\n", + "0 BD BAT1 46\n", + "1 BD BAT2 46\n", + "2 BD BAU1 49\n", + "3 BD BAU2 49\n", + "4 YUS BLO1 50\n", + "5 YUS BLO1 22\n", + "6 YUS BLO2 22\n", + "7 YUS BLO2 50\n", + "8 BD BRD1 53\n", + "9 BD BRD2 53\n", + "10 BD BSP1 9\n", + "11 BD BSP1 47\n", + "12 BD BSP2 9\n", + "13 BD BSP2 47\n", + "14 SHEP BSS1 66\n", + "15 SHEP BSS2 66\n", + "16 SHEP BYV1 65\n", + "17 SHEP BYV2 65\n", + "18 BD CFK1 52\n", + "19 BD CFK2 52\n", + "20 BD CHE1 54\n", + "21 BD CHE2 54\n", + "22 BD CHR1 45\n", + "23 BD CHR2 45\n", + "24 YUS COL1 20\n", + "25 YUS COL2 20\n", + "26 BD COX1 58\n", + "27 BD COX2 58\n", + "28 YUS CVL1 6\n", + "29 YUS CVL2 6\n", + ".. ... ... ...\n", + "134 YUS SUM1 24\n", + "135 YUS SUM2 24\n", + "136 YUS UNI1 16\n", + "137 YUS UNI2 16\n", + "138 YUS VMC1 80\n", + "139 YUS VMC2 80\n", + "140 BD VPK1 61\n", + "141 BD VPK2 61\n", + "142 BD WAR1 62\n", + "143 BD WAR2 62\n", + "144 BD WDB1 59\n", + "145 BD WDB2 59\n", + "146 YUS WEL1 21\n", + "147 YUS WEL2 21\n", + "148 YUS WIL1 2\n", + "149 YUS WIL2 2\n", + "150 SHEP YIE1 30\n", + "151 SHEP YIE1 64\n", + "152 SHEP YIE2 64\n", + "153 SHEP YIE2 30\n", + "154 YUS YKD1 3\n", + "155 YUS YKD2 3\n", + "156 YUS YKM1 29\n", + "157 YUS YKM2 29\n", + "158 BD YNG1 50\n", + "159 BD YNG1 22\n", + "160 BD YNG2 50\n", + "161 BD YNG2 22\n", + "162 YUS YUN1 77\n", + "163 YUS YUN2 77\n", + "\n", + "[164 rows x 3 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql = '''SELECT DISTINCT subwayline, station_char, stationid\n", + "FROM test_day\n", + "ORDER BY station_char'''\n", + "pandasql.read_sql(sql, con)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Seems ok. 
Rerunning the other processing queries based on that." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "split_trips = '''\n", + " CREATE SEQUENCE IF NOT EXISTS trip_ids;\n", + " CREATE MATERIALIZED VIEW test_day_w_trips AS\n", + " SELECT trainid, subwayline, traindirection, stationid, station_char, create_date, create_date + timint * interval '1 minute' AS expected_arrival, timint, train_message,\n", + " CASE traindirection WHEN lag(traindirection) OVER w THEN currval('trip_ids') ELSE nextval('trip_ids') END AS trip_id\n", + " FROM test_day\n", + " WHERE (timint < 1 OR train_message = 'AtStation') \n", + " WINDOW w AS (PARTITION BY subwayline, trainid ORDER BY create_date + timint * interval '1 minute') \n", + " '''\n", + "with con:\n", + " with con.cursor() as cur:\n", + " cur.execute(split_trips)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A final step is to group together multiple observations at the same station during the same trip, to get an approximation of the arrival and \"departure\" times."
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "final_step = ''' DROP MATERIALIZED VIEW IF EXISTS test_day_final;\n", + "CREATE MATERIALIZED VIEW test_day_final AS \n", + "SELECT trainid, subwayline, traindirection, LEFT(station_char, -1) AS station, trip_id,\n", + " MIN(expected_arrival) AS estimated_arrival, MAX(expected_arrival) AS estimated_departure,\n", + " CASE (ARRAY_AGG(train_message ORDER BY expected_arrival))[1] WHEN 'AtStation' THEN 1 ELSE 0 END AS exact_arr, \n", + "CASE (ARRAY_AGG(train_message ORDER BY expected_arrival DESC))[1] WHEN 'AtStation' THEN 1 ELSE 0 END AS exact_dep\n", + " FROM test_day_w_trips \n", + " GROUP BY trainid, subwayline, traindirection, station, trip_id \n", + "'''\n", + "with con:\n", + " with con.cursor() as cur:\n", + " cur.execute(final_step)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lineidNumber of observed tripsNumber of scheduled trips
011116747
12741704
24558457
\n", + "
" + ], + "text/plain": [ + " lineid Number of observed trips Number of scheduled trips\n", + "0 1 1116 747\n", + "1 2 741 704\n", + "2 4 558 457" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql = ''' WITH observed_trips AS \n", + " (SELECT CASE subwayline\n", + " WHEN 'YUS' THEN '1'\n", + " WHEN 'BD' THEN '2'\n", + " WHEN 'SHEP' THEN '4'\n", + " END as lineid, \n", + " COUNT(DISTINCT trip_id) AS \"Number of observed trips\"\n", + " FROM test_day_final\n", + " GROUP BY lineid)\n", + " , unique_trips AS(SELECT route_short_name AS lineid, COUNT(DISTINCT trip_id) AS \"Number of scheduled trips\"\n", + " FROM gtfs.routes -- ON lineid::TEXT = route_short_name\n", + " INNER JOIN gtfs.trips USING (route_id)\n", + " INNER JOIN gtfs.calendar USING (service_id)\n", + " WHERE monday AND route_type = 1 AND route_short_name != '3'\n", + " GROUP BY route_short_name)\n", + " \n", + " SELECT *\n", + " FROM observed_trips\n", + " INNER JOIN unique_trips USING (lineid)\n", + " ORDER BY lineid'''\n", + "pandasql.read_sql(sql, con)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "sql = '''WITH inferred_trips AS(SELECT CASE subwayline\n", + " WHEN 'YUS' THEN 1\n", + " WHEN 'BD' THEN 2\n", + " WHEN 'SHEP' THEN 4\n", + " END as lineid, trip_id, COUNT(1) as stops\n", + "FROM test_day_final\n", + "GROUP BY lineid, trip_id\n", + "),\n", + "inferred_trip_length AS( SELECT lineid, stops, COUNT(trip_id) as obs_trips\n", + "FROM inferred_trips\n", + "GROUP BY lineid, stops)\n", + ",\n", + "gtfs_trip_lengths AS(SELECT route_short_name::INT AS lineid, trip_id, COUNT(1) as stops\n", + " FROM gtfs.stop_times \n", + " INNER JOIN gtfs.trips USING (trip_id)\n", + " INNER JOIN gtfs.routes USING (route_id)\n", + " INNER JOIN gtfs.calendar USING (service_id)\n", + " WHERE monday AND route_type = 1 AND route_short_name != '3'\n", + " GROUP BY route_short_name, trip_id\n", + 
")\n", + ",gtfs_trip_length_distro AS (SELECT lineid, stops, COUNT(trip_id) as num_trips\n", + "FROM gtfs_trip_lengths\n", + "GROUP BY lineid, stops)\n", + "\n", + "SELECT lineid, stops, COALESCE(num_trips,0) as scheduled, COUNT(inferred_trips.trip_id) as observed \n", + "FROM inferred_trips\n", + "FULL OUTER JOIN gtfs_trip_length_distro USING (lineid, stops)\n", + "GROUP BY lineid, stops, num_trips\n", + "ORDER BY lineid, stops\n", + "'''\n", + "trip_lengths = pandasql.read_sql(sql, con)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "line_one = trip_lengths[trip_lengths['lineid'] == 1]\n", + "fig, ax = plt.subplots(figsize=(16,9))\n", + "line_one.plot(x='stops', y='scheduled', kind='bar', ax=ax,position=0, color='red')\n", + "line_one.plot(x='stops', y='observed', sharey=True, sharex=True, kind='bar', ax=ax, position=1, color='blue')\n", + "ax.set_title('Line 1 Distribution of Trip Lengths')\n", + "ax.yaxis.set_label('Number of trips')" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "line_one = trip_lengths[trip_lengths['lineid'] == 2]\n", + "fig, ax = plt.subplots(figsize=(16,9))\n", + "line_one.plot(x='stops', y='scheduled', kind='bar', ax=ax,position=0, color='red')\n", + "line_one.plot(x='stops', y='observed', sharey=True, sharex=True, kind='bar', ax=ax, position=1, color='blue')\n", + "ax.set_title('Line 2 Distribution of Trip Lengths')\n", + "ax.yaxis.set_label('Number of trips')" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "line_one = trip_lengths[trip_lengths['lineid'] == 4]\n", + "fig, ax = plt.subplots(figsize=(16,9))\n", + "line_one.plot(x='stops', y='scheduled', kind='bar', ax=ax,position=0, color='red')\n", + "line_one.plot(x='stops', y='observed', sharey=True, sharex=True, kind='bar', ax=ax, position=1, color='blue')\n", + "ax.set_title('Line 4 Distribution of Trip Lengths')\n", + "ax.yaxis.set_label('Number of trips')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I've managed to take out a solid chunk of trips that are too short, at the expense of generating a number of trips with more stops than are on that line...\n", + "What does one of those look like?" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
trainidsubwaylinetraindirectionstation_chartrip_idestimated_arrivalestimated_departureexact_arrexact_dep
0155YUSNorthVMC1178032019-07-17 18:51:04.0000002019-07-17 18:52:04.00000011
1155YUSNorthHWY1178032019-07-17 18:54:05.0000002019-07-17 18:54:05.00000011
2155YUSNorthPVL1178032019-07-17 18:56:07.0000002019-07-17 18:56:07.00000011
3155YUSNorthYUN1178032019-07-17 18:58:01.0000002019-07-17 18:58:01.00000011
4155YUSNorthFIW1178032019-07-17 19:00:03.0000002019-07-17 19:00:03.00000011
5155YUSNorthDNP1178032019-07-17 19:03:02.0000002019-07-17 19:03:02.00000011
6155YUSNorthSHW1178032019-07-17 19:04:59.5851432019-07-17 19:04:59.58514300
7155YUSNorthWIL1178032019-07-17 19:07:58.4971432019-07-17 19:07:58.49714300
8155YUSNorthYKD1178032019-07-17 19:09:47.0000002019-07-17 19:09:47.00000011
9155YUSNorthLWW1178032019-07-17 19:12:32.5154292019-07-17 19:12:47.00000001
10155YUSNorthGCN1178032019-07-17 19:14:41.8491432019-07-17 19:14:41.84914300
11155YUSNorthCVL1178032019-07-17 19:16:20.5737142019-07-17 19:16:20.57371400
12155YUSNorthSCW1178032019-07-17 19:19:50.0000002019-07-17 19:19:50.00000011
13155YUSNorthDUP1178032019-07-17 19:22:28.6285712019-07-17 19:22:28.62857100
14155YUSNorthSPA1178032019-07-17 19:23:24.2902862019-07-17 19:23:48.00000001
15155YUSNorthSGU1178032019-07-17 19:25:05.9760002019-07-17 19:25:17.00000001
16155YUSNorthMUS1178032019-07-17 19:26:49.3520002019-07-17 19:26:59.14400000
17155YUSNorthQPK1178032019-07-17 19:28:52.0000002019-07-17 19:28:52.00000011
18155YUSNorthSTP1178032019-07-17 19:30:18.0068572019-07-17 19:30:18.00685700
19155YUSNorthOSG1178032019-07-17 19:31:15.8582862019-07-17 19:31:15.85828600
20155YUSNorthSTA1178032019-07-17 19:32:08.9366862019-07-17 19:32:08.93668600
21155YUSNorthUNI1178032019-07-17 19:33:33.2336002019-07-17 19:33:52.00000001
22155YUSNorthKNG1178032019-07-17 19:34:40.6189942019-07-17 19:35:53.00000001
23155YUSNorthQUN1178032019-07-17 19:36:45.5174632019-07-17 19:36:55.00000001
24155YUSNorthDUN1178032019-07-17 19:37:43.1283432019-07-17 19:37:55.00000001
25155YUSNorthCOL1178032019-07-17 19:38:48.0603662019-07-17 19:38:55.00000001
26155YUSNorthWEL1178032019-07-17 19:39:51.1954512019-07-17 19:40:54.00000001
27155YUSNorthBLO1178032019-07-17 19:41:11.4596802019-07-17 19:42:16.00000001
28155YUSNorthROS1178032019-07-17 19:43:52.5504912019-07-17 19:43:52.55049100
29155YUSNorthSUM1178032019-07-17 19:44:44.7926402019-07-17 19:44:57.00000001
30155YUSNorthSTC1178032019-07-17 19:46:45.3992692019-07-17 19:46:45.39926900
31155YUSNorthDAV1178032019-07-17 19:47:59.0000002019-07-17 19:47:59.00000011
32155YUSNorthEGL1178032019-07-17 19:50:20.9804342019-07-17 19:51:59.00000001
33155YUSNorthLAW1178032019-07-17 19:55:18.7060112019-07-17 19:55:59.00000001
34155YUSNorthYKM1178032019-07-17 19:59:07.0000002019-07-17 19:59:07.00000011
35155YUSNorthSHP1178032019-07-17 20:02:37.0781262019-07-17 20:03:22.00000001
36155YUSNorthNYC1178032019-07-17 20:05:18.4562292019-07-17 20:06:00.00000001
37155YUSNorthFIN1178032019-07-17 20:09:45.0900112019-07-17 20:10:35.63972600
38155YUSNorthFIN2178032019-07-17 20:09:45.0900112019-07-17 20:10:35.63972600
\n", + "
" + ], + "text/plain": [ + " trainid subwayline traindirection station_char trip_id \\\n", + "0 155 YUS North VMC1 17803 \n", + "1 155 YUS North HWY1 17803 \n", + "2 155 YUS North PVL1 17803 \n", + "3 155 YUS North YUN1 17803 \n", + "4 155 YUS North FIW1 17803 \n", + "5 155 YUS North DNP1 17803 \n", + "6 155 YUS North SHW1 17803 \n", + "7 155 YUS North WIL1 17803 \n", + "8 155 YUS North YKD1 17803 \n", + "9 155 YUS North LWW1 17803 \n", + "10 155 YUS North GCN1 17803 \n", + "11 155 YUS North CVL1 17803 \n", + "12 155 YUS North SCW1 17803 \n", + "13 155 YUS North DUP1 17803 \n", + "14 155 YUS North SPA1 17803 \n", + "15 155 YUS North SGU1 17803 \n", + "16 155 YUS North MUS1 17803 \n", + "17 155 YUS North QPK1 17803 \n", + "18 155 YUS North STP1 17803 \n", + "19 155 YUS North OSG1 17803 \n", + "20 155 YUS North STA1 17803 \n", + "21 155 YUS North UNI1 17803 \n", + "22 155 YUS North KNG1 17803 \n", + "23 155 YUS North QUN1 17803 \n", + "24 155 YUS North DUN1 17803 \n", + "25 155 YUS North COL1 17803 \n", + "26 155 YUS North WEL1 17803 \n", + "27 155 YUS North BLO1 17803 \n", + "28 155 YUS North ROS1 17803 \n", + "29 155 YUS North SUM1 17803 \n", + "30 155 YUS North STC1 17803 \n", + "31 155 YUS North DAV1 17803 \n", + "32 155 YUS North EGL1 17803 \n", + "33 155 YUS North LAW1 17803 \n", + "34 155 YUS North YKM1 17803 \n", + "35 155 YUS North SHP1 17803 \n", + "36 155 YUS North NYC1 17803 \n", + "37 155 YUS North FIN1 17803 \n", + "38 155 YUS North FIN2 17803 \n", + "\n", + " estimated_arrival estimated_departure exact_arr exact_dep \n", + "0 2019-07-17 18:51:04.000000 2019-07-17 18:52:04.000000 1 1 \n", + "1 2019-07-17 18:54:05.000000 2019-07-17 18:54:05.000000 1 1 \n", + "2 2019-07-17 18:56:07.000000 2019-07-17 18:56:07.000000 1 1 \n", + "3 2019-07-17 18:58:01.000000 2019-07-17 18:58:01.000000 1 1 \n", + "4 2019-07-17 19:00:03.000000 2019-07-17 19:00:03.000000 1 1 \n", + "5 2019-07-17 19:03:02.000000 2019-07-17 19:03:02.000000 1 1 \n", + "6 2019-07-17 19:04:59.585143 
2019-07-17 19:04:59.585143 0 0 \n", + "7 2019-07-17 19:07:58.497143 2019-07-17 19:07:58.497143 0 0 \n", + "8 2019-07-17 19:09:47.000000 2019-07-17 19:09:47.000000 1 1 \n", + "9 2019-07-17 19:12:32.515429 2019-07-17 19:12:47.000000 0 1 \n", + "10 2019-07-17 19:14:41.849143 2019-07-17 19:14:41.849143 0 0 \n", + "11 2019-07-17 19:16:20.573714 2019-07-17 19:16:20.573714 0 0 \n", + "12 2019-07-17 19:19:50.000000 2019-07-17 19:19:50.000000 1 1 \n", + "13 2019-07-17 19:22:28.628571 2019-07-17 19:22:28.628571 0 0 \n", + "14 2019-07-17 19:23:24.290286 2019-07-17 19:23:48.000000 0 1 \n", + "15 2019-07-17 19:25:05.976000 2019-07-17 19:25:17.000000 0 1 \n", + "16 2019-07-17 19:26:49.352000 2019-07-17 19:26:59.144000 0 0 \n", + "17 2019-07-17 19:28:52.000000 2019-07-17 19:28:52.000000 1 1 \n", + "18 2019-07-17 19:30:18.006857 2019-07-17 19:30:18.006857 0 0 \n", + "19 2019-07-17 19:31:15.858286 2019-07-17 19:31:15.858286 0 0 \n", + "20 2019-07-17 19:32:08.936686 2019-07-17 19:32:08.936686 0 0 \n", + "21 2019-07-17 19:33:33.233600 2019-07-17 19:33:52.000000 0 1 \n", + "22 2019-07-17 19:34:40.618994 2019-07-17 19:35:53.000000 0 1 \n", + "23 2019-07-17 19:36:45.517463 2019-07-17 19:36:55.000000 0 1 \n", + "24 2019-07-17 19:37:43.128343 2019-07-17 19:37:55.000000 0 1 \n", + "25 2019-07-17 19:38:48.060366 2019-07-17 19:38:55.000000 0 1 \n", + "26 2019-07-17 19:39:51.195451 2019-07-17 19:40:54.000000 0 1 \n", + "27 2019-07-17 19:41:11.459680 2019-07-17 19:42:16.000000 0 1 \n", + "28 2019-07-17 19:43:52.550491 2019-07-17 19:43:52.550491 0 0 \n", + "29 2019-07-17 19:44:44.792640 2019-07-17 19:44:57.000000 0 1 \n", + "30 2019-07-17 19:46:45.399269 2019-07-17 19:46:45.399269 0 0 \n", + "31 2019-07-17 19:47:59.000000 2019-07-17 19:47:59.000000 1 1 \n", + "32 2019-07-17 19:50:20.980434 2019-07-17 19:51:59.000000 0 1 \n", + "33 2019-07-17 19:55:18.706011 2019-07-17 19:55:59.000000 0 1 \n", + "34 2019-07-17 19:59:07.000000 2019-07-17 19:59:07.000000 1 1 \n", + "35 2019-07-17 20:02:37.078126 
2019-07-17 20:03:22.000000 0 1 \n", + "36 2019-07-17 20:05:18.456229 2019-07-17 20:06:00.000000 0 1 \n", + "37 2019-07-17 20:09:45.090011 2019-07-17 20:10:35.639726 0 0 \n", + "38 2019-07-17 20:09:45.090011 2019-07-17 20:10:35.639726 0 0 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql = '''WITH inferred_trips AS(SELECT trip_id, COUNT(1) as stops\n", + "FROM test_day_final\n", + "GROUP BY trip_id\n", + "HAVING COUNT(1) > 38\n", + "LIMIT 1)\n", + "SELECT test_day_final.* \n", + "FROM test_day_final\n", + "INNER JOIN inferred_trips USING (trip_id)\n", + "ORDER BY estimated_arrival'''\n", + "pandasql.read_sql(sql, con)" ] }, {