Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit b78b438

Browse files
committed
implement nd_time.get_contiguous_time_periods() #223
1 parent 1667072 commit b78b438

File tree

2 files changed

+54
-4
lines changed

2 files changed

+54
-4
lines changed

nowcasting_dataset/data_sources/data_source.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -170,9 +170,7 @@ def get_contiguous_time_periods(self) -> pd.DataFrame:
170170
has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
171171
"""
172172

173-
# TODO:
174-
# Modify nd_time.get_start_datetimes so that it returns the segment boundaries
175-
# as a pd.DataFrame?
173+
# TODO: Use nd_time.get_contiguous_time_periods()
176174
raise NotImplementedError()
177175

178176
def _get_time_slice(self, t0_dt: pd.Timestamp):

nowcasting_dataset/time.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
""" Time functions """
22
import logging
33
import warnings
4-
from typing import Iterable, Tuple, List
4+
from typing import Iterable, Tuple, List, Dict
55

66
import numpy as np
77
import pandas as pd
@@ -118,6 +118,7 @@ def intersection_of_2_dataframes_of_periods(a: pd.DataFrame, b: pd.DataFrame) ->
118118
return all_intersecting_periods.sort_values(by="start_dt").reset_index(drop=True)
119119

120120

121+
# TODO: Delete this and its tests!
121122
def get_start_datetimes(
122123
datetimes: pd.DatetimeIndex, total_seq_len: int, max_gap: pd.Timedelta = THIRTY_MINUTES
123124
) -> pd.DatetimeIndex:
@@ -165,6 +166,57 @@ def get_start_datetimes(
165166
return pd.DatetimeIndex(np.concatenate(start_dt_index))
166167

167168

169+
# TODO: Write test!
170+
def get_contiguous_time_periods(
    datetimes: pd.DatetimeIndex, min_seq_len: int, max_gap: pd.Timedelta = THIRTY_MINUTES
) -> pd.DataFrame:
    """Return a pd.DataFrame where each row records the boundaries of one contiguous time period.

    Args:
        datetimes: The pd.DatetimeIndex of the timeseries. Must be sorted in increasing
            order and must not contain duplicates.
        min_seq_len: Sequences of `min_seq_len` timesteps or shorter will be discarded.
            Must be greater than 1.
        max_gap: If any pair of consecutive `datetimes` is more than `max_gap` apart, then
            this pair of `datetimes` will be considered a "gap" between two contiguous
            sequences.

    Returns:
        pd.DataFrame where each row represents a single time period. The pd.DataFrame
        has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
        Both boundaries are timestamps taken from `datetimes`, so `end_dt` is the last
        timestep that belongs to the period.

    Raises:
        AssertionError: If `datetimes` is empty, unsorted or contains duplicates; if
            `min_seq_len` is not greater than 1; or if no contiguous period longer than
            `min_seq_len` timesteps is found.
    """
    # Sanity checks.
    assert len(datetimes) > 0, "datetimes must not be empty"
    assert min_seq_len > 1, "min_seq_len must be greater than 1"
    # `is_monotonic_increasing` and `is_unique` are properties of pd.Index, not methods.
    # Calling them (with parentheses) raises "TypeError: 'bool' object is not callable".
    assert datetimes.is_monotonic_increasing, "datetimes must be sorted"
    assert datetimes.is_unique, "datetimes must not contain duplicates"

    # Find indices of gaps larger than max_gap:
    gap_mask = np.diff(datetimes) > max_gap
    gap_indices = np.argwhere(gap_mask)[:, 0]

    # gap_indices are the indices into `datetimes` for the timestep immediately
    # *before* the gap.  e.g. if the datetimes are 12:00, 12:05, 18:00, 18:05
    # then gap_indices will be [1].  So we add 1 to gap_indices to get
    # segment_boundaries, an index into `datetimes` which identifies the _start_
    # of each segment.
    segment_boundaries = gap_indices + 1

    # Capture the last segment of `datetimes`: len(datetimes) acts as the (exclusive)
    # "start of the next segment" for the final segment.
    segment_boundaries = np.concatenate((segment_boundaries, [len(datetimes)]))

    periods: List[Dict[str, pd.Timestamp]] = []
    start_i = 0
    for next_start_i in segment_boundaries:
        n_timesteps = next_start_i - start_i
        if n_timesteps > min_seq_len:
            # next_start_i is the first index of the *next* segment, so the last
            # timestep of the current segment is the one just before it.
            end_i = next_start_i - 1
            period = {"start_dt": datetimes[start_i], "end_dt": datetimes[end_i]}
            periods.append(period)
        start_i = next_start_i

    assert len(periods) > 0, "no contiguous time period is longer than min_seq_len"

    return pd.DataFrame(periods)
218+
219+
168220
def get_t0_datetimes(
169221
datetimes: pd.DatetimeIndex,
170222
total_seq_len: int,

0 commit comments

Comments
 (0)