Commit 939056c

FIX: Insample predictions with series of varying lengths (#1246)
1 parent 532988e commit 939056c
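
In short: `predict_insample` previously derived the trimming offset from the first series alone (`series_length = self.dataset.indptr[1] - self.dataset.indptr[0]`), which silently assumed all series were the same length. This commit instead loops over `self.dataset.n_groups`, building, trimming, and predicting a single-series `TimeSeriesDataset` per series, then concatenates the per-series forecast frames. The offset arithmetic that now runs once per series, as a standalone sketch (hypothetical lengths, not library code):

import numpy as np

h, test_size, step_size = 12, 0, 5
for series_length in (100, 87):  # two series of different lengths
    # points left over at the front that do not fit the step_size grid
    n_windows, forefront_offset = np.divmod(series_length - test_size - h, step_size)
    print(series_length, n_windows, forefront_offset)  # -> 100 17 3, then 87 15 0

And a minimal usage sketch of the fixed entry point (assuming utilsforecast's `generate_series` helper, as in the test added below; the model settings are illustrative):

from neuralforecast import NeuralForecast
from neuralforecast.models import NHITS
from utilsforecast.data import generate_series

df = generate_series(n_series=2, max_length=100)  # series lengths drawn at random, so they differ
nf = NeuralForecast(models=[NHITS(h=12, input_size=24, max_steps=1)], freq='D')
nf.fit(df=df)
insample = nf.predict_insample(step_size=1)  # no longer assumes equal series lengths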

File tree

2 files changed: +222 -136 lines changed


nbs/core.ipynb

Lines changed: 125 additions & 76 deletions
@@ -1284,7 +1284,7 @@
 " ]\n",
 " cols_order = first_out_cols + remaining_cols + [target_col]\n",
 " return ufp.sort(out[cols_order], by=[id_col, 'cutoff', time_col])\n",
-"\n",
+" \n",
 " def predict_insample(self, step_size: int = 1):\n",
 " \"\"\"Predict insample with core.NeuralForecast.\n",
 "\n",
@@ -1307,97 +1307,126 @@
 " for model in self.models:\n",
 " if model.SAMPLING_TYPE == 'recurrent':\n",
 " warnings.warn(f'Predict insample might not provide accurate predictions for \\\n",
-" recurrent model {repr(model)} class yet due to scaling.')\n",
+" recurrent model {repr(model)} class yet due to scaling.')\n",
 " print(f'WARNING: Predict insample might not provide accurate predictions for \\\n",
-" recurrent model {repr(model)} class yet due to scaling.')\n",
-" \n",
-" cols = []\n",
-" count_names = {'model': 0}\n",
-" for model in self.models:\n",
-" model_name = repr(model)\n",
-" count_names[model_name] = count_names.get(model_name, -1) + 1\n",
-" if count_names[model_name] > 0:\n",
-" model_name += str(count_names[model_name])\n",
-" cols += [model_name + n for n in model.loss.output_names]\n",
+" recurrent model {repr(model)} class yet due to scaling.')\n",
 "\n",
-" # Remove test set from dataset and last dates\n",
 " test_size = self.models[0].get_test_size()\n",
-"\n",
-" # trim the forefront period to ensure `test_size - h` should be module `step_size\n",
-" # Note: current constraint imposes that all series lengths are equal, so we can take the first series length as sample\n",
-" series_length = self.dataset.indptr[1] - self.dataset.indptr[0]\n",
-" _, forefront_offset = np.divmod((series_length - test_size - self.h), step_size)\n",
-"\n",
-" if test_size>0 or forefront_offset>0:\n",
-" trimmed_dataset = TimeSeriesDataset.trim_dataset(dataset=self.dataset,\n",
-" right_trim=test_size,\n",
-" left_trim=forefront_offset)\n",
-" new_idxs = np.hstack(\n",
-" [\n",
-" np.arange(self.dataset.indptr[i] + forefront_offset, self.dataset.indptr[i + 1] - test_size)\n",
-" for i in range(self.dataset.n_groups)\n",
-" ]\n",
+" \n",
+" # Process each series separately\n",
+" fcsts_dfs = []\n",
+" trimmed_datasets = []\n",
+" \n",
+" for i in range(self.dataset.n_groups):\n",
+" # Calculate series-specific length and offset\n",
+" series_length = self.dataset.indptr[i + 1] - self.dataset.indptr[i]\n",
+" _, forefront_offset = np.divmod((series_length - test_size - self.h), step_size)\n",
+" \n",
+" if test_size > 0 or forefront_offset > 0:\n",
+" # Create single-series dataset\n",
+" series_dataset = TimeSeriesDataset(\n",
+" temporal=self.dataset.temporal[self.dataset.indptr[i]:self.dataset.indptr[i + 1]],\n",
+" temporal_cols=self.dataset.temporal_cols,\n",
+" indptr=np.array([0, series_length]),\n",
+" y_idx=self.dataset.y_idx\n",
+" )\n",
+" \n",
+" # Trim the series\n",
+" trimmed_series = TimeSeriesDataset.trim_dataset(\n",
+" dataset=series_dataset,\n",
+" right_trim=test_size,\n",
+" left_trim=forefront_offset\n",
+" )\n",
+" \n",
+" new_idxs = np.arange(\n",
+" self.dataset.indptr[i] + forefront_offset,\n",
+" self.dataset.indptr[i + 1] - test_size\n",
+" )\n",
+" times = self.ds[new_idxs]\n",
+" else:\n",
+" trimmed_series = TimeSeriesDataset(\n",
+" temporal=self.dataset.temporal[self.dataset.indptr[i]:self.dataset.indptr[i + 1]],\n",
+" temporal_cols=self.dataset.temporal_cols,\n",
+" indptr=np.array([0, series_length]),\n",
+" y_idx=self.dataset.y_idx\n",
+" )\n",
+" times = self.ds[self.dataset.indptr[i]:self.dataset.indptr[i + 1]]\n",
+" \n",
+" series_fcsts_df = _insample_times(\n",
+" times=times,\n",
+" uids=self.uids[i:i+1],\n",
+" indptr=trimmed_series.indptr,\n",
+" h=self.h,\n",
+" freq=self.freq,\n",
+" step_size=step_size,\n",
+" id_col=self.id_col,\n",
+" time_col=self.time_col,\n",
 " )\n",
-" times = self.ds[new_idxs]\n",
-" else:\n",
-" trimmed_dataset = self.dataset\n",
-" times = self.ds\n",
-"\n",
-" # Generate dates\n",
-" fcsts_df = _insample_times(\n",
-" times=times,\n",
-" uids=self.uids,\n",
-" indptr=trimmed_dataset.indptr,\n",
-" h=self.h,\n",
-" freq=self.freq,\n",
-" step_size=step_size,\n",
-" id_col=self.id_col,\n",
-" time_col=self.time_col,\n",
-" )\n",
-"\n",
-" col_idx = 0\n",
-" fcsts = np.full((len(fcsts_df), len(cols)), np.nan, dtype=np.float32)\n",
+" \n",
+" fcsts_dfs.append(series_fcsts_df)\n",
+" trimmed_datasets.append(trimmed_series)\n",
 "\n",
+" # Combine all series forecasts DataFrames\n",
+" fcsts_df = ufp.vertical_concat(fcsts_dfs)\n",
+" \n",
+" # Generate predictions for each model\n",
+" fcsts_list = []\n",
 " for model in self.models:\n",
-" # Test size is the number of periods to forecast (full size of trimmed dataset)\n",
-" model.set_test_size(test_size=trimmed_dataset.max_size)\n",
-"\n",
-" # Predict\n",
-" model_fcsts = model.predict(trimmed_dataset, step_size=step_size)\n",
-" # Append predictions in memory placeholder\n",
-" output_length = len(model.loss.output_names)\n",
-" fcsts[:,col_idx:(col_idx + output_length)] = model_fcsts\n",
-" col_idx += output_length \n",
-" model.set_test_size(test_size=test_size) # Set original test_size\n",
-"\n",
-" # original y\n",
+" model_series_preds = []\n",
+" for i, trimmed_dataset in enumerate(trimmed_datasets):\n",
+" # Set test size to current series length\n",
+" model.set_test_size(test_size=trimmed_dataset.max_size)\n",
+" # Generate predictions\n",
+" model_fcsts = model.predict(trimmed_dataset, step_size=step_size)\n",
+" # Handle distributional forecasts; take only median\n",
+" if len(model_fcsts.shape) > 1 and model_fcsts.shape[1] == 3:\n",
+" model_fcsts = model_fcsts[:, 0] # Take first column (median)\n",
+" # Ensure consistent 2D shape\n",
+" if len(model_fcsts.shape) == 1:\n",
+" model_fcsts = model_fcsts.reshape(-1, 1)\n",
+" model_series_preds.append(model_fcsts)\n",
+" model_preds = np.concatenate(model_series_preds, axis=0)\n",
+" fcsts_list.append(model_preds)\n",
+" # Reset test size to original\n",
+" model.set_test_size(test_size=test_size)\n",
+" \n",
+" # Combine all predictions\n",
+" fcsts = np.hstack(fcsts_list)\n",
+" \n",
+" # Add original y values\n",
 " original_y = {\n",
 " self.id_col: ufp.repeat(self.uids, np.diff(self.dataset.indptr)),\n",
 " self.time_col: self.ds,\n",
 " self.target_col: self.dataset.temporal[:, 0].numpy(),\n",
 " }\n",
 "\n",
-" # Add predictions to forecasts DataFrame\n",
+" # Create forecasts DataFrame\n",
+" cols = self._get_model_names()\n",
+" selected_cols = [col for col in cols if not col.endswith(('-lo', '-hi')) and (not '-' in col or col.endswith('-median'))]\n",
 " if isinstance(self.uids, pl_Series):\n",
-" fcsts = pl_DataFrame(dict(zip(cols, fcsts.T)))\n",
+" fcsts = pl_DataFrame(dict(zip(selected_cols, fcsts.T)))\n",
 " Y_df = pl_DataFrame(original_y)\n",
 " else:\n",
-" fcsts = pd.DataFrame(fcsts, columns=cols)\n",
+" fcsts = pd.DataFrame(fcsts, columns=selected_cols)\n",
 " Y_df = pd.DataFrame(original_y).reset_index(drop=True)\n",
-" fcsts_df = ufp.horizontal_concat([fcsts_df, fcsts])\n",
 "\n",
-" # Add original input df's y to forecasts DataFrame\n",
+" # Combine forecasts with dates\n",
+" fcsts_df = ufp.horizontal_concat([fcsts_df, fcsts])\n",
+" \n",
+" # Add original values\n",
 " fcsts_df = ufp.join(fcsts_df, Y_df, how='left', on=[self.id_col, self.time_col])\n",
+" \n",
+" # Apply scaling if needed\n",
 " if self.scalers_:\n",
 " sizes = ufp.counts_by_id(fcsts_df, self.id_col)['counts'].to_numpy()\n",
 " indptr = np.append(0, sizes.cumsum())\n",
 " invert_cols = cols + [self.target_col]\n",
 " fcsts_df[invert_cols] = self._scalers_target_inverse_transform(\n",
 " fcsts_df[invert_cols].to_numpy(),\n",
 " indptr\n",
-" ) \n",
+" )\n",
 " return fcsts_df\n",
-" \n",
+"\n",
 " # Save list of models with pytorch lightning save_checkpoint function\n",
 " def save(self, path: str, model_index: Optional[List]=None, save_dataset: bool=True, overwrite: bool=False):\n",
 " \"\"\"Save NeuralForecast core class.\n",
@@ -2079,15 +2108,16 @@
 "n_series = 2\n",
 "h = 12\n",
 "\n",
-"config = {'input_size': tune.choice([12, 24]), \n",
-" 'hidden_size': 128,\n",
-" 'max_steps': 1,\n",
-" 'val_check_steps': 1,\n",
-" 'step_size': 12}\n",
-"\n",
+"def get_expected_size(df, h, test_size, step_size):\n",
+" expected_size = 0\n",
+" uids = df['unique_id'].unique()\n",
+" for uid in uids:\n",
+" input_len = len(df[df['unique_id'] == uid])\n",
+" expected_size += ((input_len - test_size - h) / step_size + 1)*h\n",
+" return expected_size\n",
+" \n",
 "models = [\n",
 " NHITS(h=h, input_size=24, loss=MQLoss(level=[80]), max_steps=1, alias='NHITS', scaler_type=None),\n",
-" AutoMLP(h=12, config=config, cpus=1, num_samples=1),\n",
 " RNN(h=h, input_size=-1, loss=MAE(), max_steps=1, alias='RNN', scaler_type=None),\n",
 " ]\n",
 "\n",
@@ -2096,7 +2126,26 @@
 "\n",
 "forecasts = nf.predict_insample(step_size=1)\n",
 "\n",
-"expected_size = n_series*((len(AirPassengersPanel_train)//n_series-test_size)-h+1)*h\n",
+"expected_size = get_expected_size(AirPassengersPanel_train, h, test_size, step_size=1)\n",
+"assert len(forecasts) == expected_size, f'Shape mismatch in predict_insample: {len(forecasts)=}, {expected_size=}'"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "8d996a0f",
+"metadata": {},
+"outputs": [],
+"source": [
+"#| hide\n",
+"# Test predict_insample (different lengths)\n",
+"diff_len_df = generate_series(n_series=n_series, max_length=100)\n",
+"\n",
+"nf = NeuralForecast(models=models, freq='D')\n",
+"cv = nf.cross_validation(df=diff_len_df, val_size=0, test_size=test_size, n_windows=None)\n",
+"\n",
+"forecasts = nf.predict_insample(step_size=1)\n",
+"expected_size = get_expected_size(diff_len_df, h, test_size, step_size=1)\n",
 "assert len(forecasts) == expected_size, f'Shape mismatch in predict_insample: {len(forecasts)=}, {expected_size=}'"
 ]
 },
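
The `get_expected_size` helper added above counts, per series, `((input_len - test_size - h) / step_size + 1) * h` forecast rows; both asserts compare against that sum. A worked check of the formula with illustrative numbers (a sketch, not part of the commit):

# one series of 100 points, test_size=12, h=12, step_size=1
input_len, test_size, h, step_size = 100, 12, 12, 1
expected_rows = ((input_len - test_size - h) / step_size + 1) * h
assert expected_rows == 924  # 77 insample windows x 12 forecast points each

The second test cell exercises the varying-lengths path directly: `generate_series` draws each series' length at random up to `max_length=100`, so the two generated series will generally differ in length.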
@@ -2866,7 +2915,7 @@
 "source": [
 "#| hide\n",
 "#| polars\n",
-"models = [LSTM(h=12, input_size=24, max_steps=5, hist_exog_list=['zeros'], scaler_type='robust')]\n",
+"models = [LSTM(h=12, input_size=24, max_steps=5, scaler_type='robust')]\n",
 "\n",
 "# Pandas\n",
 "nf = NeuralForecast(models=models, freq='M')\n",
@@ -2940,7 +2989,7 @@
 " last_cutoff = train_end - test_size * pd.offsets.MonthEnd() - h * pd.offsets.MonthEnd()\n",
 " expected_cutoffs = np.flip(np.array([last_cutoff - step_size * i * pd.offsets.MonthEnd() for i in range(n_expected_cutoffs)]))\n",
 " pl_cutoffs = forecasts.filter(polars.col('uid') ==nf.uids[1]).select('cutoff').unique(maintain_order=True)\n",
-" actual_cutoffs = np.array([pd.Timestamp(x['cutoff']) for x in pl_cutoffs.rows(named=True)])\n",
+" actual_cutoffs = np.sort(np.array([pd.Timestamp(x['cutoff']) for x in pl_cutoffs.rows(named=True)]))\n",
 " np.testing.assert_array_equal(expected_cutoffs, actual_cutoffs, err_msg=f\"{step_size=},{expected_cutoffs=},{actual_cutoffs=}\")\n",
 "\n",
 " # check forecast-points count per series\n",

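The last hunk wraps `actual_cutoffs` in `np.sort` before comparing against the chronologically ordered `expected_cutoffs`: with forecasts now assembled per series, `unique(maintain_order=True)` returns cutoffs in order of appearance, which is no longer guaranteed to be chronological. A toy version of the repaired comparison (hypothetical dates):

import numpy as np
import pandas as pd

expected = np.array([pd.Timestamp('2000-01-31'), pd.Timestamp('2000-02-29'), pd.Timestamp('2000-03-31')])
actual = np.array([pd.Timestamp('2000-02-29'), pd.Timestamp('2000-01-31'), pd.Timestamp('2000-03-31')])  # appearance order
np.testing.assert_array_equal(expected, np.sort(actual))  # passes once sorted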