Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Big new design Part 2 :) #307

Merged
merged 39 commits into from
Nov 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b102da8
Making a start on the big new design! Sketched out the basic design i…
JackKelly Oct 28, 2021
63f0a2a
Implement arg_logger decorator
JackKelly Oct 28, 2021
663852d
enable load_solar_pv_data to load from any compute environment. Fixe…
JackKelly Oct 28, 2021
61be554
Successfully gets t0 datetimes
JackKelly Oct 28, 2021
ff18699
fix incorrect logger message
JackKelly Oct 28, 2021
8d5043b
successfully checks for CSV file
JackKelly Oct 28, 2021
8bef05c
Check there is no overlap between split datetimes. Fixes #299
JackKelly Oct 28, 2021
4d28923
Successfully creates directories and spatial_and_temporal_locations_o…
JackKelly Oct 28, 2021
33318b3
tidy up check_directories
JackKelly Oct 28, 2021
856fe64
Fix merge conflicts with main
JackKelly Oct 29, 2021
af0b8f7
implement Manager._get_first_batches_to_create()
JackKelly Oct 29, 2021
b2387f3
start fleshing out Manager.create_batches()
JackKelly Oct 29, 2021
04d4fbb
Finish first rough draft of Manager.create_batches()
JackKelly Oct 29, 2021
72e39c8
Finally, a full complete draft of #213. Not yet tested
JackKelly Oct 29, 2021
07db836
open DataSource
JackKelly Oct 29, 2021
7004973
Delete datamodule.py and datasets.py
JackKelly Oct 29, 2021
f896a5e
Remove n_timesteps_per_batch and _cache from DataSources.
JackKelly Oct 29, 2021
e3d1597
Implement get_filesystem()
JackKelly Oct 29, 2021
af6707a
prepare_ml_data.py runs and successfully creates GSP batches!
JackKelly Oct 29, 2021
71fdd78
implement check_input_paths_exist() in all DataSources
JackKelly Oct 29, 2021
01b364d
fixed about half the unittests
JackKelly Oct 29, 2021
bd57063
all tests pass except the test_data_source_list.py. Fixed some error…
JackKelly Nov 1, 2021
b1f54b1
All tests pass!
JackKelly Nov 1, 2021
38ecb01
fix linter errors
JackKelly Nov 1, 2021
f047423
more linter fixes
JackKelly Nov 1, 2021
bb1fecf
fix variable naming
JackKelly Nov 1, 2021
d1ef6ab
Update comments
JackKelly Nov 2, 2021
05b184c
update README
JackKelly Nov 2, 2021
b556a73
Convert get_maximum_batch_id() to use lexographical sorting. Fixes #308
JackKelly Nov 2, 2021
bb515dd
address reviewer comments
JackKelly Nov 2, 2021
0dfbe45
implement more reviewer suggestions
JackKelly Nov 2, 2021
9fbcfd2
addressing more reviewer comments
JackKelly Nov 2, 2021
eaae156
update docs
JackKelly Nov 2, 2021
c51fad5
remove pytorch lightning!
JackKelly Nov 2, 2021
75b4ce8
Fix bug: Create target directory if it does not exist
JackKelly Nov 2, 2021
e303d94
Update docstring
JackKelly Nov 2, 2021
cbaab07
Fix bug: Set first_batch_to_create to zero if the target directory do…
JackKelly Nov 2, 2021
51f3640
fix spelling mistake
JackKelly Nov 2, 2021
ce6e5ba
check columns names
JackKelly Nov 2, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,13 +113,13 @@ There does not seem to be an automated way to do this selecting and downloading,
## Configure `nowcasting_dataset` to point to the downloaded data

Copy and modify one of the config yaml files in
[`nowcasting_dataset/config/`](https://github.com/openclimatefix/nowcasting_dataset/tree/main/nowcasting_dataset/config)
and modify `prepare_ml_data.py` to use your config file.
[`nowcasting_dataset/config/`](https://github.com/openclimatefix/nowcasting_dataset/tree/main/nowcasting_dataset/config).


## Prepare ML batches

Run [`scripts/prepare_ml_data.py`](https://github.com/openclimatefix/nowcasting_dataset/blob/main/scripts/prepare_ml_data.py)
Run [`scripts/prepare_ml_data.py --help`](https://github.com/openclimatefix/nowcasting_dataset/blob/main/scripts/prepare_ml_data.py)
to learn how to run the `prepare_ml_data.py` script.


## What exactly is in each batch?
Expand Down
17 changes: 8 additions & 9 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
register_xr_data_set_to_tensor()


def pytest_addoption(parser):
def pytest_addoption(parser): # noqa: D103
parser.addoption(
"--use_cloud_data",
action="store_true",
Expand All @@ -32,12 +32,12 @@ def pytest_addoption(parser):


@pytest.fixture
def use_cloud_data(request):
def use_cloud_data(request): # noqa: D103
return request.config.getoption("--use_cloud_data")


@pytest.fixture
def sat_filename(use_cloud_data: bool) -> Path:
def sat_filename(use_cloud_data: bool) -> Path: # noqa: D103
if use_cloud_data:
return consts.SAT_FILENAME
else:
Expand All @@ -47,24 +47,23 @@ def sat_filename(use_cloud_data: bool) -> Path:


@pytest.fixture
def sat_data_source(sat_filename: Path):
def sat_data_source(sat_filename: Path): # noqa: D103
return SatelliteDataSource(
image_size_pixels=pytest.IMAGE_SIZE_PIXELS,
zarr_path=sat_filename,
history_minutes=0,
forecast_minutes=5,
channels=("HRV",),
n_timesteps_per_batch=2,
)


@pytest.fixture
def general_data_source():
def general_data_source(): # noqa: D103
return MetadataDataSource(history_minutes=0, forecast_minutes=5, object_at_center="GSP")


@pytest.fixture
def gsp_data_source():
def gsp_data_source(): # noqa: D103
return GSPDataSource(
image_size_pixels=16,
meters_per_pixel=2000,
Expand All @@ -75,13 +74,13 @@ def gsp_data_source():


@pytest.fixture
def configuration():
def configuration(): # noqa: D103
filename = os.path.join(os.path.dirname(nowcasting_dataset.__file__), "config", "gcp.yaml")
configuration = load_yaml_configuration(filename)

return configuration


@pytest.fixture
def test_data_folder():
def test_data_folder(): # noqa: D103
return os.path.join(os.path.dirname(nowcasting_dataset.__file__), "../tests/data")
3 changes: 0 additions & 3 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ dependencies:

# Machine learning
- pytorch::pytorch # explicitly specify pytorch channel to prevent conda from using conda-forge for pytorch, and hence installing the CPU-only version.
- pytorch-lightning

# PV & Geospatial
- pvlib
Expand All @@ -45,6 +44,4 @@ dependencies:
- pre-commit

- pip:
- neptune-client[pytorch-lightning]
- tilemapbase
- git+https://github.com/SheffieldSolar/PV_Live-API
2 changes: 1 addition & 1 deletion notebooks/2021-09/2021-09-07/sat_data.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
"""Notebook"""
from datetime import datetime

from nowcasting_dataset.data_sources.satellite.satellite_data_source import SatelliteDataSource
Expand All @@ -9,7 +10,6 @@
forecast_len=12,
image_size_pixels=64,
meters_per_pixel=2000,
n_timesteps_per_batch=32,
)

s.open()
Expand Down
2 changes: 2 additions & 0 deletions nowcasting_dataset/config/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
""" Configuration of the dataset """
from nowcasting_dataset.config.load import load_yaml_configuration
from nowcasting_dataset.config.model import Configuration, InputData, set_git_commit
39 changes: 32 additions & 7 deletions nowcasting_dataset/config/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,18 @@
from typing import Optional

import git
import pandas as pd
from pathy import Pathy
from pydantic import BaseModel, Field, root_validator, validator

# nowcasting_dataset imports
from nowcasting_dataset.consts import (
DEFAULT_N_GSP_PER_EXAMPLE,
DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE,
NWP_VARIABLE_NAMES,
SAT_VARIABLE_NAMES,
)

from nowcasting_dataset.dataset.split import split

IMAGE_SIZE_PIXELS_FIELD = Field(64, description="The number of pixels of the region of interest.")
METERS_PER_PIXEL_FIELD = Field(2000, description="The number of meters per pixel.")
Expand Down Expand Up @@ -102,7 +104,7 @@ class Satellite(DataSourceMixin):
"""Satellite configuration model"""

satellite_zarr_path: str = Field(
"gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16_single_timestep.zarr",
"gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16_single_timestep.zarr", # noqa: E501
description="The path which holds the satellite zarr.",
)
satellite_channels: tuple = Field(
Expand All @@ -116,7 +118,7 @@ class NWP(DataSourceMixin):
"""NWP configuration model"""

nwp_zarr_path: str = Field(
"gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV__2018-01_to_2019-12__chunks__variable10__init_time1__step1__x548__y704__.zarr",
"gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV__2018-01_to_2019-12__chunks__variable10__init_time1__step1__x548__y704__.zarr", # noqa: E501
description="The path which holds the NWP zarr.",
)
nwp_channels: tuple = Field(NWP_VARIABLE_NAMES, description="the channels used in the nwp data")
Expand Down Expand Up @@ -213,7 +215,8 @@ def set_forecast_and_history_minutes(cls, values):
Run through the different data sources and if the forecast or history minutes are not set,
then set them to the default values
"""

# It would be much better to use nowcasting_dataset.data_sources.ALL_DATA_SOURCE_NAMES,
# but that causes a circular import.
ALL_DATA_SOURCE_NAMES = ("pv", "satellite", "nwp", "gsp", "topographic", "sun")
enabled_data_sources = [
data_source_name
Expand Down Expand Up @@ -249,8 +252,8 @@ def set_all_to_defaults(cls):
class OutputData(BaseModel):
"""Output data model"""

filepath: str = Field(
"gs://solar-pv-nowcasting-data/prepared_ML_training_data/v7/",
filepath: Pathy = Field(
Pathy("gs://solar-pv-nowcasting-data/prepared_ML_training_data/v7/"),
description=(
"Where the data is saved to. If this is running on the cloud then should include"
" 'gs://' or 's3://'"
Expand All @@ -262,7 +265,29 @@ class Process(BaseModel):
"""Pydantic model of how the data is processed"""

seed: int = Field(1234, description="Random seed, so experiments can be repeatable")
batch_size: int = Field(32, description="the number of examples per batch")
batch_size: int = Field(32, description="The number of examples per batch")
t0_datetime_frequency: pd.Timedelta = Field(
pd.Timedelta("5 minutes"),
description=(
"The temporal frequency at which t0 datetimes will be sampled."
" Can be any string that `pandas.Timedelta()` understands."
" For example, if this is set to '5 minutes', then, for each example, the t0 datetime"
" could be at 0, 5, ..., 55 minutes past the hour. If there are DataSources with a"
" lower sample rate (e.g. half-hourly) then these lower-sample-rate DataSources will"
" still produce valid examples. For example, if a half-hourly DataSource is asked for"
" an example with t0=12:05, history_minutes=60, forecast_minutes=60, then it will"
" return data at 11:30, 12:00, 12:30, and 13:00."
),
)
split_method: split.SplitMethod = Field(
split.SplitMethod.DAY,
description=(
"The method used to split the t0 datetimes into train, validation and test sets."
),
)
n_train_batches: int = 250
n_validation_batches: int = 10
n_test_batches: int = 10
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wonder whether the defaults should be slightly larger.

Might be good to add a Field description too. I know it's fairly obvious from the name, but it would help in the future.

upload_every_n_batches: int = Field(
16,
description=(
Expand Down
2 changes: 1 addition & 1 deletion nowcasting_dataset/config/on_premises.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ input_data:
topographic_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/Topographic/europe_dem_1km_osgb.tif

output_data:
filepath: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v8/
filepath: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v_testing/
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally this would be v9 (or v11), but perhaps this is fine for now. A TODO could be added to change this in the future.

process:
batch_size: 32
seed: 1234
Expand Down
6 changes: 6 additions & 0 deletions nowcasting_dataset/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,9 @@
TOPOGRAPHIC_X_COORDS,
] + list(DATETIME_FEATURE_NAMES)
T0_DT = "t0_dt"


SPATIAL_AND_TEMPORAL_LOCATIONS_OF_EACH_EXAMPLE_FILENAME = (
"spatial_and_temporal_locations_of_each_example.csv"
)
SPATIAL_AND_TEMPORAL_LOCATIONS_COLUMN_NAMES = ("t0_datetime_UTC", "x_center_OSGB", "y_center_OSGB")
18 changes: 15 additions & 3 deletions nowcasting_dataset/data_sources/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,23 @@
""" Various DataSources """
from nowcasting_dataset.data_sources.data_source import DataSource
from nowcasting_dataset.data_sources.datetime.datetime_data_source import DatetimeDataSource
from nowcasting_dataset.data_sources.data_source import DataSource # noqa: F401
from nowcasting_dataset.data_sources.datetime.datetime_data_source import ( # noqa: F401
DatetimeDataSource,
)
from nowcasting_dataset.data_sources.gsp.gsp_data_source import GSPDataSource
from nowcasting_dataset.data_sources.nwp.nwp_data_source import NWPDataSource
from nowcasting_dataset.data_sources.pv.pv_data_source import PVDataSource
from nowcasting_dataset.data_sources.satellite.satellite_data_source import SatelliteDataSource
from nowcasting_dataset.data_sources.gsp.gsp_data_source import GSPDataSource
from nowcasting_dataset.data_sources.sun.sun_data_source import SunDataSource
from nowcasting_dataset.data_sources.topographic.topographic_data_source import (
TopographicDataSource,
)

MAP_DATA_SOURCE_NAME_TO_CLASS = {
"pv": PVDataSource,
"satellite": SatelliteDataSource,
"nwp": NWPDataSource,
"gsp": GSPDataSource,
"topographic": TopographicDataSource,
"sun": SunDataSource,
}
ALL_DATA_SOURCE_NAMES = tuple(MAP_DATA_SOURCE_NAME_TO_CLASS.keys())
Loading