|
1 | 1 | """ Time functions """
|
2 | 2 | import logging
|
3 | 3 | import warnings
|
4 |
| -from typing import Iterable, Tuple, List |
| 4 | +from typing import Iterable, Tuple, List, Dict |
5 | 5 |
|
6 | 6 | import numpy as np
|
7 | 7 | import pandas as pd
|
@@ -118,6 +118,7 @@ def intersection_of_2_dataframes_of_periods(a: pd.DataFrame, b: pd.DataFrame) ->
|
118 | 118 | return all_intersecting_periods.sort_values(by="start_dt").reset_index(drop=True)
|
119 | 119 |
|
120 | 120 |
|
| 121 | +# TODO: Delete this and its tests! |
121 | 122 | def get_start_datetimes(
|
122 | 123 | datetimes: pd.DatetimeIndex, total_seq_len: int, max_gap: pd.Timedelta = THIRTY_MINUTES
|
123 | 124 | ) -> pd.DatetimeIndex:
|
@@ -165,6 +166,57 @@ def get_start_datetimes(
|
165 | 166 | return pd.DatetimeIndex(np.concatenate(start_dt_index))
|
166 | 167 |
|
167 | 168 |
|
| 169 | +# TODO: Write test! |
def get_contiguous_time_periods(
    datetimes: pd.DatetimeIndex, min_seq_len: int, max_gap: pd.Timedelta = THIRTY_MINUTES
) -> pd.DataFrame:
    """Return a pd.DataFrame where each row records the boundaries of one contiguous time period.

    Args:
      datetimes: The pd.DatetimeIndex of the timeseries. Must be non-empty, sorted in
        increasing order, and contain no duplicates.
      min_seq_len: Sequences of min_seq_len timesteps or shorter will be discarded.
        Must be greater than 1.
      max_gap: If any pair of consecutive `datetimes` is more than `max_gap` apart, then this pair
        of `datetimes` will be considered a "gap" between two contiguous sequences.

    Returns:
      pd.DataFrame where each row represents a single time period.  The pd.DataFrame
      has two columns: `start_dt` and `end_dt` (where 'dt' is short for 'datetime').
      Both boundaries are inclusive: they are the first and last entries of `datetimes`
      which belong to the period.

    Raises:
      AssertionError: If any of the sanity checks on the inputs fail, or if no
        contiguous period longer than `min_seq_len` timesteps is found.
    """
    # Sanity checks.
    assert len(datetimes) > 0, "datetimes must not be empty"
    assert min_seq_len > 1, "min_seq_len must be greater than 1"
    # `is_monotonic_increasing` and `is_unique` are *properties* of pd.Index.
    # Calling them like methods would raise "TypeError: 'bool' object is not callable".
    assert datetimes.is_monotonic_increasing, "datetimes must be sorted in increasing order"
    assert datetimes.is_unique, "datetimes must not contain duplicates"

    # Find indices of gaps larger than max_gap.  Wrapping the differences in a
    # pd.TimedeltaIndex keeps the comparison against the pd.Timedelta `max_gap`
    # inside pandas, rather than relying on numpy to interpret a pd.Timedelta.
    gap_mask = pd.TimedeltaIndex(np.diff(datetimes)) > max_gap
    gap_indices = np.flatnonzero(gap_mask)

    # gap_indices are the indices into `datetimes` for the timestep immediately
    # *before* each gap.  e.g. if the datetimes are 12:00, 12:05, 18:00, 18:05
    # then gap_indices will be [1].  So we add 1 to gap_indices to get
    # segment_boundaries, an index into `datetimes` which identifies the _start_
    # of each segment (other than the very first segment, which starts at 0).
    segment_boundaries = gap_indices + 1

    # Append len(datetimes) so the loop below also captures the last segment.
    segment_boundaries = np.concatenate((segment_boundaries, [len(datetimes)]))

    periods: List[Dict[str, pd.Timestamp]] = []
    start_i = 0
    for next_start_i in segment_boundaries:
        n_timesteps = next_start_i - start_i
        # Strictly greater than: segments of exactly min_seq_len are discarded too.
        if n_timesteps > min_seq_len:
            end_i = next_start_i - 1
            periods.append({"start_dt": datetimes[start_i], "end_dt": datetimes[end_i]})
        start_i = next_start_i

    assert len(periods) > 0, "no contiguous time period is longer than min_seq_len"

    return pd.DataFrame(periods)
| 218 | + |
| 219 | + |
168 | 220 | def get_t0_datetimes(
|
169 | 221 | datetimes: pd.DatetimeIndex,
|
170 | 222 | total_seq_len: int,
|
|
0 commit comments