Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit d9d83fd

Browse files
Merge pull request #714 from openclimatefix/issue/update-use-pv-live
get gsp list from pvlive, not url
2 parents ca58f19 + 43ea8f0 commit d9d83fd

File tree

4 files changed

+61
-51
lines changed

4 files changed

+61
-51
lines changed

nowcasting_dataset/data_sources/gsp/eso.py

Lines changed: 2 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,19 @@
88
99
get_gsp_metadata_from_eso: gets the gsp metadata
1010
get_gsp_shape_from_eso: gets the shape of the gsp regions
11-
get_list_of_gsp_ids: gets a list of gsp_ids, by using 'get_gsp_metadata_from_eso'
1211
1312
Peter Dudfield
1413
2021-09-13
1514
"""
1615

17-
import json
1816
import logging
1917
import os
20-
import urllib
21-
from typing import List, Optional
2218
from urllib.request import urlopen
2319

2420
import geopandas as gpd
2521
import pandas as pd
2622

23+
from nowcasting_dataset.data_sources.gsp.pvlive import get_list_of_gsp_ids
2724
from nowcasting_dataset.geospatial import osgb_to_lat_lon
2825

2926
logger = logging.getLogger(__name__)
@@ -70,13 +67,7 @@ def get_gsp_metadata_from_eso(
7067
logger.debug("loading local file for ESO metadata:done")
7168
else:
7269
# we now get this from pvlive
73-
url = "https://api0.solar.sheffield.ac.uk/pvlive/api/v4/gsp_list"
74-
# TODO need to replace this, but not quite sure what it will be for the moment.
75-
with urllib.request.urlopen(url) as fileobj:
76-
d = json.loads(fileobj.read())
77-
78-
# make dataframe
79-
metadata = pd.DataFrame(data=d["data"], columns=d["meta"])
70+
metadata = get_list_of_gsp_ids(return_dataframe=True, return_national=False)
8071

8172
# drop duplicates
8273
metadata = metadata.drop_duplicates(subset=["gsp_id"])
@@ -212,36 +203,3 @@ def get_gsp_shape_from_eso(
212203
shape_gpd["RegionID"] = range(1, len(shape_gpd) + 1)
213204

214205
return shape_gpd
215-
216-
217-
def get_list_of_gsp_ids(maximum_number_of_gsp: Optional[int] = None) -> List[int]:
218-
"""
219-
Get list of gsp ids from ESO metadata
220-
221-
Args:
222-
maximum_number_of_gsp: Truncate list of GSPs to be no larger than this number of GSPs.
223-
Set to None to disable truncation.
224-
225-
Returns: list of gsp ids
226-
227-
"""
228-
# get a lit of gsp ids
229-
metadata = get_gsp_metadata_from_eso(calculate_centroid=False)
230-
231-
# get rid of nans, and duplicates
232-
metadata = metadata[~metadata["gsp_id"].isna()]
233-
metadata.drop_duplicates(subset=["gsp_id"], inplace=True)
234-
235-
# make into list
236-
gsp_ids = metadata["gsp_id"].to_list()
237-
gsp_ids = [int(gsp_id) for gsp_id in gsp_ids]
238-
239-
# adjust number of gsp_ids
240-
if maximum_number_of_gsp is None:
241-
maximum_number_of_gsp = len(metadata)
242-
if maximum_number_of_gsp > len(metadata):
243-
logger.warning(f"Only {len(metadata)} gsp available to load")
244-
if maximum_number_of_gsp < len(metadata):
245-
gsp_ids = gsp_ids[0:maximum_number_of_gsp]
246-
247-
return gsp_ids

nowcasting_dataset/data_sources/gsp/pvlive.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,57 @@
22
import logging
33
from concurrent import futures
44
from datetime import datetime, timedelta
5-
from typing import Optional
5+
from typing import List, Optional, Union
66

77
import pandas as pd
88
import pytz
99
from pvlive_api import PVLive
1010
from tqdm import tqdm
1111

12-
from nowcasting_dataset.data_sources.gsp.eso import get_list_of_gsp_ids
13-
1412
logger = logging.getLogger(__name__)
1513

1614
CHUNK_DURATION = timedelta(days=30)
1715

1816

17+
def get_list_of_gsp_ids(
18+
maximum_number_of_gsp: Optional[int] = None,
19+
return_dataframe: bool = False,
20+
return_national: bool = True,
21+
) -> Union[List[int], pd.DataFrame]:
22+
"""
23+
Get list of gsp ids from PVLive
24+
25+
Args:
26+
maximum_number_of_gsp: Truncate list of GSPs to be no larger than this number of GSPs.
27+
Set to None to disable truncation.
28+
return_dataframe: Return as a dataframe with columns 'gsp_id', 'gsp_name', 'pes_id'
29+
return_national: Return gsp_id=0 in this data
30+
31+
Returns: list of gsp ids
32+
33+
"""
34+
35+
# set up the PVLive class, although here we are getting historic data
36+
pvl = PVLive()
37+
gsp_ids = pvl.gsp_list
38+
39+
if not return_national:
40+
gsp_ids = gsp_ids[gsp_ids["gsp_id"] != 0]
41+
42+
# adjust number of gsp_ids
43+
if maximum_number_of_gsp is None:
44+
maximum_number_of_gsp = len(gsp_ids)
45+
if maximum_number_of_gsp > len(gsp_ids):
46+
logger.warning(f"Only {len(gsp_ids)} gsp available to load")
47+
if maximum_number_of_gsp < len(gsp_ids):
48+
gsp_ids = gsp_ids[0:maximum_number_of_gsp]
49+
50+
if return_dataframe:
51+
return gsp_ids
52+
else:
53+
return list(gsp_ids["gsp_id"])
54+
55+
1956
def load_pv_gsp_raw_data_from_pvlive(
2057
start: datetime, end: datetime, number_of_gsp: int = None, normalize_data: bool = True
2158
) -> pd.DataFrame:

nowcasting_dataset/manager/manager.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,14 @@ def configure_loggers(
8282
def create_files_specifying_spatial_and_temporal_locations_of_each_example_if_necessary(
8383
self,
8484
) -> None:
85-
"""Creates CSV files specifying the locations of each example if those files don't exist yet.
85+
"""Creates CSV files specifying the locations of each example
86+
87+
This only happens if those files don't exist yet.
8688
8789
Creates one file per split, in this location:
8890
89-
`<output_data.filepath> / <split_name> / spatial_and_temporal_locations_of_each_example.csv`
91+
`<output_data.filepath> / <split_name> /
92+
spatial_and_temporal_locations_of_each_example.csv`
9093
9194
Creates the output directory if it does not exist.
9295

tests/data_sources/gsp/test_gsp_pvlive.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,20 @@
66

77
from nowcasting_dataset.data_sources.gsp.pvlive import (
88
get_installed_capacity,
9+
get_list_of_gsp_ids,
910
load_pv_gsp_raw_data_from_pvlive,
1011
)
1112

1213

14+
def test_get_list_of_gsp_ids():
15+
"""Test get lis of gsp ids"""
16+
gsp_id = get_list_of_gsp_ids(maximum_number_of_gsp=10)
17+
assert len(gsp_id) == 10
18+
19+
gsp_id = get_list_of_gsp_ids()
20+
assert len(gsp_id) == 318
21+
22+
1323
def test_load_gsp_raw_data_from_pvlive_one_gsp_one_day():
1424
"""
1525
Test that one gsp system data can be loaded, just for one day
@@ -58,7 +68,7 @@ def test_load_gsp_raw_data_from_pvlive_one_gsp():
5868

5969
assert isinstance(gsp_pv_df, pd.DataFrame)
6070
print(gsp_pv_df)
61-
assert len(gsp_pv_df) == (48 * 30)
71+
assert len(gsp_pv_df) == (48 * 30) + 1
6272
# 30 days in january,
6373
assert "datetime_gmt" in gsp_pv_df.columns
6474
assert "generation_mw" in gsp_pv_df.columns
@@ -89,4 +99,6 @@ def test_get_installed_capacity():
8999

90100
assert len(installed_capacity) == 3
91101
assert "installedcapacity_mwp" == installed_capacity.name
92-
assert installed_capacity.iloc[0] == 177.0772
102+
103+
# look at first GSP
104+
assert installed_capacity.iloc[1] == 177.0772

0 commit comments

Comments
 (0)