|
1284 | 1284 | " ]\n",
|
1285 | 1285 | " cols_order = first_out_cols + remaining_cols + [target_col]\n",
|
1286 | 1286 | " return ufp.sort(out[cols_order], by=[id_col, 'cutoff', time_col])\n",
|
1287 |
| - "\n", |
| 1287 | + " \n", |
1288 | 1288 | " def predict_insample(self, step_size: int = 1):\n",
|
1289 | 1289 | " \"\"\"Predict insample with core.NeuralForecast.\n",
|
1290 | 1290 | "\n",
|
|
1307 | 1307 | " for model in self.models:\n",
|
1308 | 1308 | " if model.SAMPLING_TYPE == 'recurrent':\n",
|
1309 | 1309 | " warnings.warn(f'Predict insample might not provide accurate predictions for \\\n",
|
1310 |
| - " recurrent model {repr(model)} class yet due to scaling.')\n", |
| 1310 | + " recurrent model {repr(model)} class yet due to scaling.')\n", |
1311 | 1311 | " print(f'WARNING: Predict insample might not provide accurate predictions for \\\n",
|
1312 |
| - " recurrent model {repr(model)} class yet due to scaling.')\n", |
1313 |
| - " \n", |
1314 |
| - " cols = []\n", |
1315 |
| - " count_names = {'model': 0}\n", |
1316 |
| - " for model in self.models:\n", |
1317 |
| - " model_name = repr(model)\n", |
1318 |
| - " count_names[model_name] = count_names.get(model_name, -1) + 1\n", |
1319 |
| - " if count_names[model_name] > 0:\n", |
1320 |
| - " model_name += str(count_names[model_name])\n", |
1321 |
| - " cols += [model_name + n for n in model.loss.output_names]\n", |
| 1312 | + " recurrent model {repr(model)} class yet due to scaling.')\n", |
1322 | 1313 | "\n",
|
1323 |
| - " # Remove test set from dataset and last dates\n", |
1324 | 1314 | " test_size = self.models[0].get_test_size()\n",
|
1325 |
| - "\n", |
1326 |
| - " # trim the forefront period to ensure `test_size - h` should be module `step_size\n", |
1327 |
| - " # Note: current constraint imposes that all series lengths are equal, so we can take the first series length as sample\n", |
1328 |
| - " series_length = self.dataset.indptr[1] - self.dataset.indptr[0]\n", |
1329 |
| - " _, forefront_offset = np.divmod((series_length - test_size - self.h), step_size)\n", |
1330 |
| - "\n", |
1331 |
| - " if test_size>0 or forefront_offset>0:\n", |
1332 |
| - " trimmed_dataset = TimeSeriesDataset.trim_dataset(dataset=self.dataset,\n", |
1333 |
| - " right_trim=test_size,\n", |
1334 |
| - " left_trim=forefront_offset)\n", |
1335 |
| - " new_idxs = np.hstack(\n", |
1336 |
| - " [\n", |
1337 |
| - " np.arange(self.dataset.indptr[i] + forefront_offset, self.dataset.indptr[i + 1] - test_size)\n", |
1338 |
| - " for i in range(self.dataset.n_groups)\n", |
1339 |
| - " ]\n", |
| 1315 | + " \n", |
| 1316 | + " # Process each series separately\n", |
| 1317 | + " fcsts_dfs = []\n", |
| 1318 | + " trimmed_datasets = []\n", |
| 1319 | + " \n", |
| 1320 | + " for i in range(self.dataset.n_groups):\n", |
| 1321 | + " # Calculate series-specific length and offset\n", |
| 1322 | + " series_length = self.dataset.indptr[i + 1] - self.dataset.indptr[i]\n", |
| 1323 | + " _, forefront_offset = np.divmod((series_length - test_size - self.h), step_size)\n", |
| 1324 | + " \n", |
| 1325 | + " if test_size > 0 or forefront_offset > 0:\n", |
| 1326 | + " # Create single-series dataset\n", |
| 1327 | + " series_dataset = TimeSeriesDataset(\n", |
| 1328 | + " temporal=self.dataset.temporal[self.dataset.indptr[i]:self.dataset.indptr[i + 1]],\n", |
| 1329 | + " temporal_cols=self.dataset.temporal_cols,\n", |
| 1330 | + " indptr=np.array([0, series_length]),\n", |
| 1331 | + " y_idx=self.dataset.y_idx\n", |
| 1332 | + " )\n", |
| 1333 | + " \n", |
| 1334 | + " # Trim the series\n", |
| 1335 | + " trimmed_series = TimeSeriesDataset.trim_dataset(\n", |
| 1336 | + " dataset=series_dataset,\n", |
| 1337 | + " right_trim=test_size,\n", |
| 1338 | + " left_trim=forefront_offset\n", |
| 1339 | + " )\n", |
| 1340 | + " \n", |
| 1341 | + " new_idxs = np.arange(\n", |
| 1342 | + " self.dataset.indptr[i] + forefront_offset,\n", |
| 1343 | + " self.dataset.indptr[i + 1] - test_size\n", |
| 1344 | + " )\n", |
| 1345 | + " times = self.ds[new_idxs]\n", |
| 1346 | + " else:\n", |
| 1347 | + " trimmed_series = TimeSeriesDataset(\n", |
| 1348 | + " temporal=self.dataset.temporal[self.dataset.indptr[i]:self.dataset.indptr[i + 1]],\n", |
| 1349 | + " temporal_cols=self.dataset.temporal_cols,\n", |
| 1350 | + " indptr=np.array([0, series_length]),\n", |
| 1351 | + " y_idx=self.dataset.y_idx\n", |
| 1352 | + " )\n", |
| 1353 | + " times = self.ds[self.dataset.indptr[i]:self.dataset.indptr[i + 1]]\n", |
| 1354 | + " \n", |
| 1355 | + " series_fcsts_df = _insample_times(\n", |
| 1356 | + " times=times,\n", |
| 1357 | + " uids=self.uids[i:i+1],\n", |
| 1358 | + " indptr=trimmed_series.indptr,\n", |
| 1359 | + " h=self.h,\n", |
| 1360 | + " freq=self.freq,\n", |
| 1361 | + " step_size=step_size,\n", |
| 1362 | + " id_col=self.id_col,\n", |
| 1363 | + " time_col=self.time_col,\n", |
1340 | 1364 | " )\n",
|
1341 |
| - " times = self.ds[new_idxs]\n", |
1342 |
| - " else:\n", |
1343 |
| - " trimmed_dataset = self.dataset\n", |
1344 |
| - " times = self.ds\n", |
1345 |
| - "\n", |
1346 |
| - " # Generate dates\n", |
1347 |
| - " fcsts_df = _insample_times(\n", |
1348 |
| - " times=times,\n", |
1349 |
| - " uids=self.uids,\n", |
1350 |
| - " indptr=trimmed_dataset.indptr,\n", |
1351 |
| - " h=self.h,\n", |
1352 |
| - " freq=self.freq,\n", |
1353 |
| - " step_size=step_size,\n", |
1354 |
| - " id_col=self.id_col,\n", |
1355 |
| - " time_col=self.time_col,\n", |
1356 |
| - " )\n", |
1357 |
| - "\n", |
1358 |
| - " col_idx = 0\n", |
1359 |
| - " fcsts = np.full((len(fcsts_df), len(cols)), np.nan, dtype=np.float32)\n", |
| 1365 | + " \n", |
| 1366 | + " fcsts_dfs.append(series_fcsts_df)\n", |
| 1367 | + " trimmed_datasets.append(trimmed_series)\n", |
1360 | 1368 | "\n",
|
| 1369 | + " # Combine all series forecasts DataFrames\n", |
| 1370 | + " fcsts_df = ufp.vertical_concat(fcsts_dfs)\n", |
| 1371 | + " \n", |
| 1372 | + " # Generate predictions for each model\n", |
| 1373 | + " fcsts_list = []\n", |
1361 | 1374 | " for model in self.models:\n",
|
1362 |
| - " # Test size is the number of periods to forecast (full size of trimmed dataset)\n", |
1363 |
| - " model.set_test_size(test_size=trimmed_dataset.max_size)\n", |
1364 |
| - "\n", |
1365 |
| - " # Predict\n", |
1366 |
| - " model_fcsts = model.predict(trimmed_dataset, step_size=step_size)\n", |
1367 |
| - " # Append predictions in memory placeholder\n", |
1368 |
| - " output_length = len(model.loss.output_names)\n", |
1369 |
| - " fcsts[:,col_idx:(col_idx + output_length)] = model_fcsts\n", |
1370 |
| - " col_idx += output_length \n", |
1371 |
| - " model.set_test_size(test_size=test_size) # Set original test_size\n", |
1372 |
| - "\n", |
1373 |
| - " # original y\n", |
| 1375 | + " model_series_preds = []\n", |
| 1376 | + " for i, trimmed_dataset in enumerate(trimmed_datasets):\n", |
| 1377 | + " # Set test size to the trimmed series length so predictions cover every insample window\n", |
| 1378 | + " model.set_test_size(test_size=trimmed_dataset.max_size)\n", |
| 1379 | + " # Generate predictions\n", |
| 1380 | + " model_fcsts = model.predict(trimmed_dataset, step_size=step_size)\n", |
| 1381 | + " # Distributional forecasts (3 output columns, e.g. MQLoss with a single level): keep only the median\n", |
| 1382 | + " if len(model_fcsts.shape) > 1 and model_fcsts.shape[1] == 3:\n", |
| 1383 | + " model_fcsts = model_fcsts[:, 0] # The median is the first output column\n", |
| 1384 | + " # Ensure consistent 2D shape\n", |
| 1385 | + " if len(model_fcsts.shape) == 1:\n", |
| 1386 | + " model_fcsts = model_fcsts.reshape(-1, 1)\n", |
| 1387 | + " model_series_preds.append(model_fcsts)\n", |
| 1388 | + " model_preds = np.concatenate(model_series_preds, axis=0)\n", |
| 1389 | + " fcsts_list.append(model_preds)\n", |
| 1390 | + " # Reset test size to original\n", |
| 1391 | + " model.set_test_size(test_size=test_size)\n", |
| 1392 | + " \n", |
| 1393 | + " # Combine all predictions\n", |
| 1394 | + " fcsts = np.hstack(fcsts_list)\n", |
| 1395 | + " \n", |
| 1396 | + " # Add original y values\n", |
1374 | 1397 | " original_y = {\n",
|
1375 | 1398 | " self.id_col: ufp.repeat(self.uids, np.diff(self.dataset.indptr)),\n",
|
1376 | 1399 | " self.time_col: self.ds,\n",
|
1377 | 1400 | " self.target_col: self.dataset.temporal[:, 0].numpy(),\n",
|
1378 | 1401 | " }\n",
|
1379 | 1402 | "\n",
|
1380 |
| - " # Add predictions to forecasts DataFrame\n", |
| 1403 | + " # Create forecasts DataFrame\n", |
| 1404 | + " cols = self._get_model_names()\n", |
| 1405 | + " selected_cols = [col for col in cols if not col.endswith(('-lo', '-hi')) and ('-' not in col or col.endswith('-median'))]\n", |
1381 | 1406 | " if isinstance(self.uids, pl_Series):\n",
|
1382 |
| - " fcsts = pl_DataFrame(dict(zip(cols, fcsts.T)))\n", |
| 1407 | + " fcsts = pl_DataFrame(dict(zip(selected_cols, fcsts.T)))\n", |
1383 | 1408 | " Y_df = pl_DataFrame(original_y)\n",
|
1384 | 1409 | " else:\n",
|
1385 |
| - " fcsts = pd.DataFrame(fcsts, columns=cols)\n", |
| 1410 | + " fcsts = pd.DataFrame(fcsts, columns=selected_cols)\n", |
1386 | 1411 | " Y_df = pd.DataFrame(original_y).reset_index(drop=True)\n",
|
1387 |
| - " fcsts_df = ufp.horizontal_concat([fcsts_df, fcsts])\n", |
1388 | 1412 | "\n",
|
1389 |
| - " # Add original input df's y to forecasts DataFrame\n", |
| 1413 | + " # Combine forecasts with dates\n", |
| 1414 | + " fcsts_df = ufp.horizontal_concat([fcsts_df, fcsts])\n", |
| 1415 | + " \n", |
| 1416 | + " # Add original values\n", |
1390 | 1417 | " fcsts_df = ufp.join(fcsts_df, Y_df, how='left', on=[self.id_col, self.time_col])\n",
|
| 1418 | + " \n", |
| 1419 | + " # Apply scaling if needed\n", |
1391 | 1420 | " if self.scalers_:\n",
|
1392 | 1421 | " sizes = ufp.counts_by_id(fcsts_df, self.id_col)['counts'].to_numpy()\n",
|
1393 | 1422 | " indptr = np.append(0, sizes.cumsum())\n",
|
1394 |
| - " invert_cols = cols + [self.target_col]\n", |
| 1423 | + " invert_cols = selected_cols + [self.target_col]\n", |
|
1395 | 1424 | " fcsts_df[invert_cols] = self._scalers_target_inverse_transform(\n",
|
1396 | 1425 | " fcsts_df[invert_cols].to_numpy(),\n",
|
1397 | 1426 | " indptr\n",
|
1398 |
| - " ) \n", |
| 1427 | + " )\n", |
1399 | 1428 | " return fcsts_df\n",
|
1400 |
| - " \n", |
| 1429 | + "\n", |
1401 | 1430 | " # Save list of models with pytorch lightning save_checkpoint function\n",
|
1402 | 1431 | " def save(self, path: str, model_index: Optional[List]=None, save_dataset: bool=True, overwrite: bool=False):\n",
|
1403 | 1432 | " \"\"\"Save NeuralForecast core class.\n",
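
The per-series trimming above rests on a small arithmetic invariant: np.divmod(series_length - test_size - h, step_size) splits the usable span into a whole number of forecast windows plus a leftover forefront_offset, which is then dropped from the oldest end of each series. A minimal sketch of that invariant, using hypothetical sizes rather than values from the notebook:

    import numpy as np

    # Hypothetical series: 100 points, 12 held out for test, horizon 12, stride 5.
    series_length, test_size, h, step_size = 100, 12, 12, 5
    n_steps, forefront_offset = np.divmod(series_length - test_size - h, step_size)
    # Dropping the forefront_offset oldest points leaves 100 - 12 - 1 = 87 observations,
    # which fit exactly n_steps + 1 = 16 windows of length 12 spaced 5 apart.
    assert (series_length - test_size - forefront_offset - h) // step_size + 1 == n_steps + 1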
|
|
2079 | 2108 | "n_series = 2\n",
|
2080 | 2109 | "h = 12\n",
|
2081 | 2110 | "\n",
|
2082 |
| - "config = {'input_size': tune.choice([12, 24]), \n", |
2083 |
| - " 'hidden_size': 128,\n", |
2084 |
| - " 'max_steps': 1,\n", |
2085 |
| - " 'val_check_steps': 1,\n", |
2086 |
| - " 'step_size': 12}\n", |
2087 |
| - "\n", |
| 2111 | + "def get_expected_size(df, h, test_size, step_size):\n", |
| 2112 | + " expected_size = 0\n", |
| 2113 | + " uids = df['unique_id'].unique()\n", |
| 2114 | + " for uid in uids:\n", |
| 2115 | + " input_len = len(df[df['unique_id'] == uid])\n", |
| 2116 | + " expected_size += ((input_len - test_size - h) // step_size + 1) * h\n", |
| 2117 | + " return expected_size\n", |
| 2118 | + " \n", |
2088 | 2119 | "models = [\n",
|
2089 | 2120 | " NHITS(h=h, input_size=24, loss=MQLoss(level=[80]), max_steps=1, alias='NHITS', scaler_type=None),\n",
|
2090 |
| - " AutoMLP(h=12, config=config, cpus=1, num_samples=1),\n", |
2091 | 2121 | " RNN(h=h, input_size=-1, loss=MAE(), max_steps=1, alias='RNN', scaler_type=None),\n",
|
2092 | 2122 | " ]\n",
|
2093 | 2123 | "\n",
|
|
2096 | 2126 | "\n",
|
2097 | 2127 | "forecasts = nf.predict_insample(step_size=1)\n",
|
2098 | 2128 | "\n",
|
2099 |
| - "expected_size = n_series*((len(AirPassengersPanel_train)//n_series-test_size)-h+1)*h\n", |
| 2129 | + "expected_size = get_expected_size(AirPassengersPanel_train, h, test_size, step_size=1)\n", |
| 2130 | + "assert len(forecasts) == expected_size, f'Shape mismatch in predict_insample: {len(forecasts)=}, {expected_size=}'" |
| 2131 | + ] |
| 2132 | + }, |
| 2133 | + { |
| 2134 | + "cell_type": "code", |
| 2135 | + "execution_count": null, |
| 2136 | + "id": "8d996a0f", |
| 2137 | + "metadata": {}, |
| 2138 | + "outputs": [], |
| 2139 | + "source": [ |
| 2140 | + "#| hide\n", |
| 2141 | + "# Test predict_insample (different lengths)\n", |
| 2142 | + "diff_len_df = generate_series(n_series=n_series, max_length=100)\n", |
| 2143 | + "\n", |
| 2144 | + "nf = NeuralForecast(models=models, freq='D')\n", |
| 2145 | + "cv = nf.cross_validation(df=diff_len_df, val_size=0, test_size=test_size, n_windows=None)\n", |
| 2146 | + "\n", |
| 2147 | + "forecasts = nf.predict_insample(step_size=1)\n", |
| 2148 | + "expected_size = get_expected_size(diff_len_df, h, test_size, step_size=1)\n", |
2100 | 2149 | "assert len(forecasts) == expected_size, f'Shape mismatch in predict_insample: {len(forecasts)=}, {expected_size=}'"
|
2101 | 2150 | ]
|
2102 | 2151 | },
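
The helper above counts ((input_len - test_size - h) // step_size + 1) insample windows of h rows for each series, which is what lets the assertion cope with unequal series lengths. A toy check of get_expected_size under hypothetical lengths:

    import pandas as pd

    # Two series of different lengths; h=12, test_size=12, step_size=1 are made-up values.
    lengths = {'a': 40, 'b': 28}
    toy = pd.concat(
        pd.DataFrame({'unique_id': uid,
                      'ds': pd.date_range('2000-01-01', periods=n, freq='D'),
                      'y': 0.0})
        for uid, n in lengths.items()
    )
    # 'a' yields (40 - 24) + 1 = 17 windows and 'b' yields (28 - 24) + 1 = 5, each of 12 rows.
    assert get_expected_size(toy, h=12, test_size=12, step_size=1) == (17 + 5) * 12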
|
|
2866 | 2915 | "source": [
|
2867 | 2916 | "#| hide\n",
|
2868 | 2917 | "#| polars\n",
|
2869 |
| - "models = [LSTM(h=12, input_size=24, max_steps=5, hist_exog_list=['zeros'], scaler_type='robust')]\n", |
| 2918 | + "models = [LSTM(h=12, input_size=24, max_steps=5, scaler_type='robust')]\n", |
2870 | 2919 | "\n",
|
2871 | 2920 | "# Pandas\n",
|
2872 | 2921 | "nf = NeuralForecast(models=models, freq='M')\n",
|
|
2940 | 2989 | " last_cutoff = train_end - test_size * pd.offsets.MonthEnd() - h * pd.offsets.MonthEnd()\n",
|
2941 | 2990 | " expected_cutoffs = np.flip(np.array([last_cutoff - step_size * i * pd.offsets.MonthEnd() for i in range(n_expected_cutoffs)]))\n",
|
2942 | 2991 | " pl_cutoffs = forecasts.filter(polars.col('uid') == nf.uids[1]).select('cutoff').unique(maintain_order=True)\n",
|
2943 |
| - " actual_cutoffs = np.array([pd.Timestamp(x['cutoff']) for x in pl_cutoffs.rows(named=True)])\n", |
| 2992 | + " actual_cutoffs = np.sort(np.array([pd.Timestamp(x['cutoff']) for x in pl_cutoffs.rows(named=True)]))\n", |
2944 | 2993 | " np.testing.assert_array_equal(expected_cutoffs, actual_cutoffs, err_msg=f\"{step_size=},{expected_cutoffs=},{actual_cutoffs=}\")\n",
|
2945 | 2994 | "\n",
|
2946 | 2995 | " # check forecast-points count per series\n",
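
The cutoff check above encodes the split arithmetic directly: the newest cutoff sits test_size + h month-ends before the end of training, earlier cutoffs are step_size month-ends apart, and the actual cutoffs are sorted because unique(maintain_order=True) preserves appearance order, not chronological order. A sketch of the expected side with hypothetical values:

    import numpy as np
    import pandas as pd

    # Hypothetical split: training ends 1959-12-31 with test_size=12, h=12, step_size=2.
    train_end = pd.Timestamp('1959-12-31')
    test_size, h, step_size, n_expected_cutoffs = 12, 12, 2, 3
    last_cutoff = train_end - test_size * pd.offsets.MonthEnd() - h * pd.offsets.MonthEnd()
    expected_cutoffs = np.flip(np.array(
        [last_cutoff - step_size * i * pd.offsets.MonthEnd() for i in range(n_expected_cutoffs)]
    ))
    print(expected_cutoffs)  # ascending: 1957-08-31, 1957-10-31, 1957-12-31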
|
|