Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Big new design Part 2 :) #307

Merged
merged 39 commits into from
Nov 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b102da8
Making a start on the big new design! Sketched out the basic design i…
JackKelly Oct 28, 2021
63f0a2a
Implement arg_logger decorator
JackKelly Oct 28, 2021
663852d
enable load_solar_pv_data to load from any compute environment. Fixe…
JackKelly Oct 28, 2021
61be554
Successfully gets t0 datetimes
JackKelly Oct 28, 2021
ff18699
fix incorrect logger message
JackKelly Oct 28, 2021
8d5043b
successfully checks for CSV file
JackKelly Oct 28, 2021
8bef05c
Check there is no overlap between split datetimes. Fixes #299
JackKelly Oct 28, 2021
4d28923
Successfully creates directories and spatial_and_temporal_locations_o…
JackKelly Oct 28, 2021
33318b3
tidy up check_directories
JackKelly Oct 28, 2021
856fe64
Fix merge conflicts with main
JackKelly Oct 29, 2021
af0b8f7
implement Manager._get_first_batches_to_create()
JackKelly Oct 29, 2021
b2387f3
start fleshing out Manager.create_batches()
JackKelly Oct 29, 2021
04d4fbb
Finish first rough draft of Manager.create_batches()
JackKelly Oct 29, 2021
72e39c8
Finally, a full complete draft of #213. Not yet tested
JackKelly Oct 29, 2021
07db836
open DataSource
JackKelly Oct 29, 2021
7004973
Delete datamodule.py and datasets.py
JackKelly Oct 29, 2021
f896a5e
Remove n_timesteps_per_batch and _cache from DataSources.
JackKelly Oct 29, 2021
e3d1597
Implement get_filesystem()
JackKelly Oct 29, 2021
af6707a
prepare_ml_data.py runs and successfully creates GSP batches!
JackKelly Oct 29, 2021
71fdd78
implement check_input_paths_exist() in all DataSources
JackKelly Oct 29, 2021
01b364d
fixed about half the unittests
JackKelly Oct 29, 2021
bd57063
all tests pass except the test_data_source_list.py. Fixed some error…
JackKelly Nov 1, 2021
b1f54b1
All tests pass!
JackKelly Nov 1, 2021
38ecb01
fix linter errors
JackKelly Nov 1, 2021
f047423
more linter fixes
JackKelly Nov 1, 2021
bb1fecf
fix variable naming
JackKelly Nov 1, 2021
d1ef6ab
Update comments
JackKelly Nov 2, 2021
05b184c
update README
JackKelly Nov 2, 2021
b556a73
Convert get_maximum_batch_id() to use lexographical sorting. Fixes #308
JackKelly Nov 2, 2021
bb515dd
address reviewer comments
JackKelly Nov 2, 2021
0dfbe45
implement more reviewer suggestions
JackKelly Nov 2, 2021
9fbcfd2
addressing more reviewer comments
JackKelly Nov 2, 2021
eaae156
update docs
JackKelly Nov 2, 2021
c51fad5
remove pytorch lightning!
JackKelly Nov 2, 2021
75b4ce8
Fix bug: Create target directory if it does not exist
JackKelly Nov 2, 2021
e303d94
Update docstring
JackKelly Nov 2, 2021
cbaab07
Fix bug: Set first_batch_to_create to zero if the target directory do…
JackKelly Nov 2, 2021
51f3640
fix spelling mistake
JackKelly Nov 2, 2021
ce6e5ba
check columns names
JackKelly Nov 2, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,13 +113,13 @@ There does not seem to be an automated way to do this selecting and downloading,
## Configure `nowcasting_dataset` to point to the downloaded data

Copy and modify one of the config yaml files in
[`nowcasting_dataset/config/`](https://github.com/openclimatefix/nowcasting_dataset/tree/main/nowcasting_dataset/config)
and modify `prepare_ml_data.py` to use your config file.
[`nowcasting_dataset/config/`](https://github.com/openclimatefix/nowcasting_dataset/tree/main/nowcasting_dataset/config).


## Prepare ML batches

Run [`scripts/prepare_ml_data.py`](https://github.com/openclimatefix/nowcasting_dataset/blob/main/scripts/prepare_ml_data.py)
Run [`scripts/prepare_ml_data.py --help`](https://github.com/openclimatefix/nowcasting_dataset/blob/main/scripts/prepare_ml_data.py)
to learn how to run the `prepare_ml_data.py` script.


## What exactly is in each batch?
Expand Down
17 changes: 8 additions & 9 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
register_xr_data_set_to_tensor()


def pytest_addoption(parser):
def pytest_addoption(parser): # noqa: D103
parser.addoption(
"--use_cloud_data",
action="store_true",
Expand All @@ -32,12 +32,12 @@ def pytest_addoption(parser):


@pytest.fixture
def use_cloud_data(request):
def use_cloud_data(request): # noqa: D103
return request.config.getoption("--use_cloud_data")


@pytest.fixture
def sat_filename(use_cloud_data: bool) -> Path:
def sat_filename(use_cloud_data: bool) -> Path: # noqa: D103
if use_cloud_data:
return consts.SAT_FILENAME
else:
Expand All @@ -47,24 +47,23 @@ def sat_filename(use_cloud_data: bool) -> Path:


@pytest.fixture
def sat_data_source(sat_filename: Path):
def sat_data_source(sat_filename: Path): # noqa: D103
return SatelliteDataSource(
image_size_pixels=pytest.IMAGE_SIZE_PIXELS,
zarr_path=sat_filename,
history_minutes=0,
forecast_minutes=5,
channels=("HRV",),
n_timesteps_per_batch=2,
)


@pytest.fixture
def general_data_source():
def general_data_source(): # noqa: D103
return MetadataDataSource(history_minutes=0, forecast_minutes=5, object_at_center="GSP")


@pytest.fixture
def gsp_data_source():
def gsp_data_source(): # noqa: D103
return GSPDataSource(
image_size_pixels=16,
meters_per_pixel=2000,
Expand All @@ -75,13 +74,13 @@ def gsp_data_source():


@pytest.fixture
def configuration():
def configuration(): # noqa: D103
filename = os.path.join(os.path.dirname(nowcasting_dataset.__file__), "config", "gcp.yaml")
configuration = load_yaml_configuration(filename)

return configuration


@pytest.fixture
def test_data_folder():
def test_data_folder(): # noqa: D103
return os.path.join(os.path.dirname(nowcasting_dataset.__file__), "../tests/data")
3 changes: 0 additions & 3 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ dependencies:

# Machine learning
- pytorch::pytorch # explicitly specify pytorch channel to prevent conda from using conda-forge for pytorch, and hence installing the CPU-only version.
- pytorch-lightning

# PV & Geospatial
- pvlib
Expand All @@ -45,6 +44,4 @@ dependencies:
- pre-commit

- pip:
- neptune-client[pytorch-lightning]
- tilemapbase
- git+https://github.com/SheffieldSolar/PV_Live-API
2 changes: 1 addition & 1 deletion notebooks/2021-09/2021-09-07/sat_data.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
"""Notebook"""
from datetime import datetime

from nowcasting_dataset.data_sources.satellite.satellite_data_source import SatelliteDataSource
Expand All @@ -9,7 +10,6 @@
forecast_len=12,
image_size_pixels=64,
meters_per_pixel=2000,
n_timesteps_per_batch=32,
)

s.open()
Expand Down
2 changes: 2 additions & 0 deletions nowcasting_dataset/config/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
""" Configuration of the dataset """
from nowcasting_dataset.config.load import load_yaml_configuration
from nowcasting_dataset.config.model import Configuration, InputData, set_git_commit
39 changes: 32 additions & 7 deletions nowcasting_dataset/config/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,18 @@
from typing import Optional

import git
import pandas as pd
from pathy import Pathy
from pydantic import BaseModel, Field, root_validator, validator

# nowcasting_dataset imports
from nowcasting_dataset.consts import (
DEFAULT_N_GSP_PER_EXAMPLE,
DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE,
NWP_VARIABLE_NAMES,
SAT_VARIABLE_NAMES,
)

from nowcasting_dataset.dataset.split import split

IMAGE_SIZE_PIXELS_FIELD = Field(64, description="The number of pixels of the region of interest.")
METERS_PER_PIXEL_FIELD = Field(2000, description="The number of meters per pixel.")
Expand Down Expand Up @@ -102,7 +104,7 @@ class Satellite(DataSourceMixin):
"""Satellite configuration model"""

satellite_zarr_path: str = Field(
"gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16_single_timestep.zarr",
"gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16_single_timestep.zarr", # noqa: E501
description="The path which holds the satellite zarr.",
)
satellite_channels: tuple = Field(
Expand All @@ -116,7 +118,7 @@ class NWP(DataSourceMixin):
"""NWP configuration model"""

nwp_zarr_path: str = Field(
"gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV__2018-01_to_2019-12__chunks__variable10__init_time1__step1__x548__y704__.zarr",
"gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV__2018-01_to_2019-12__chunks__variable10__init_time1__step1__x548__y704__.zarr", # noqa: E501
description="The path which holds the NWP zarr.",
)
nwp_channels: tuple = Field(NWP_VARIABLE_NAMES, description="the channels used in the nwp data")
Expand Down Expand Up @@ -213,7 +215,8 @@ def set_forecast_and_history_minutes(cls, values):
Run through the different data sources and if the forecast or history minutes are not set,
then set them to the default values
"""

# It would be much better to use nowcasting_dataset.data_sources.ALL_DATA_SOURCE_NAMES,
# but that causes a circular import.
ALL_DATA_SOURCE_NAMES = ("pv", "satellite", "nwp", "gsp", "topographic", "sun")
enabled_data_sources = [
data_source_name
Expand Down Expand Up @@ -249,8 +252,8 @@ def set_all_to_defaults(cls):
class OutputData(BaseModel):
"""Output data model"""

filepath: str = Field(
"gs://solar-pv-nowcasting-data/prepared_ML_training_data/v7/",
filepath: Pathy = Field(
Pathy("gs://solar-pv-nowcasting-data/prepared_ML_training_data/v7/"),
description=(
"Where the data is saved to. If this is running on the cloud then should include"
" 'gs://' or 's3://'"
Expand All @@ -262,7 +265,29 @@ class Process(BaseModel):
"""Pydantic model of how the data is processed"""

seed: int = Field(1234, description="Random seed, so experiments can be repeatable")
batch_size: int = Field(32, description="the number of examples per batch")
batch_size: int = Field(32, description="The number of examples per batch")
t0_datetime_frequency: pd.Timedelta = Field(
pd.Timedelta("5 minutes"),
description=(
"The temporal frequency at which t0 datetimes will be sampled."
" Can be any string that `pandas.Timedelta()` understands."
" For example, if this is set to '5 minutes', then, for each example, the t0 datetime"
" could be at 0, 5, ..., 55 minutes past the hour. If there are DataSources with a"
" lower sample rate (e.g. half-hourly) then these lower-sample-rate DataSources will"
" still produce valid examples. For example, if a half-hourly DataSource is asked for"
" an example with t0=12:05, history_minutes=60, forecast_minutes=60, then it will"
" return data at 11:30, 12:00, 12:30, and 13:00."
),
)
split_method: split.SplitMethod = Field(
split.SplitMethod.DAY,
description=(
"The method used to split the t0 datetimes into train, validation and test sets."
),
)
n_train_batches: int = 250
n_validation_batches: int = 10
n_test_batches: int = 10
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wonder whether the defaults should be slightly larger.

Might be good to add a Field description too. I know it's fairly obvious from the name, but it would help in the future.

upload_every_n_batches: int = Field(
16,
description=(
Expand Down
2 changes: 1 addition & 1 deletion nowcasting_dataset/config/on_premises.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ input_data:
topographic_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/Topographic/europe_dem_1km_osgb.tif

output_data:
filepath: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v8/
filepath: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/prepared_ML_training_data/v_testing/
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally this would be v9 (or v11), but perhaps this is fine for now. A TODO could be added to change this in the future.

process:
batch_size: 32
seed: 1234
Expand Down
6 changes: 6 additions & 0 deletions nowcasting_dataset/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,9 @@
TOPOGRAPHIC_X_COORDS,
] + list(DATETIME_FEATURE_NAMES)
T0_DT = "t0_dt"


SPATIAL_AND_TEMPORAL_LOCATIONS_OF_EACH_EXAMPLE_FILENAME = (
"spatial_and_temporal_locations_of_each_example.csv"
)
SPATIAL_AND_TEMPORAL_LOCATIONS_COLUMN_NAMES = ("t0_datetime_UTC", "x_center_OSGB", "y_center_OSGB")
18 changes: 15 additions & 3 deletions nowcasting_dataset/data_sources/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,23 @@
""" Various DataSources """
from nowcasting_dataset.data_sources.data_source import DataSource
from nowcasting_dataset.data_sources.datetime.datetime_data_source import DatetimeDataSource
from nowcasting_dataset.data_sources.data_source import DataSource # noqa: F401
from nowcasting_dataset.data_sources.datetime.datetime_data_source import ( # noqa: F401
DatetimeDataSource,
)
from nowcasting_dataset.data_sources.gsp.gsp_data_source import GSPDataSource
from nowcasting_dataset.data_sources.nwp.nwp_data_source import NWPDataSource
from nowcasting_dataset.data_sources.pv.pv_data_source import PVDataSource
from nowcasting_dataset.data_sources.satellite.satellite_data_source import SatelliteDataSource
from nowcasting_dataset.data_sources.gsp.gsp_data_source import GSPDataSource
from nowcasting_dataset.data_sources.sun.sun_data_source import SunDataSource
from nowcasting_dataset.data_sources.topographic.topographic_data_source import (
TopographicDataSource,
)

MAP_DATA_SOURCE_NAME_TO_CLASS = {
"pv": PVDataSource,
"satellite": SatelliteDataSource,
"nwp": NWPDataSource,
"gsp": GSPDataSource,
"topographic": TopographicDataSource,
"sun": SunDataSource,
}
ALL_DATA_SOURCE_NAMES = tuple(MAP_DATA_SOURCE_NAME_TO_CLASS.keys())
Loading