add hf radar, national_platforms and slow but accurate glider metric
ocefpaf committed Apr 8, 2024
1 parent d64412d commit d924b1f
Showing 8 changed files with 547 additions and 19 deletions.
32 changes: 26 additions & 6 deletions .pre-commit-config.yaml
@@ -1,3 +1,13 @@
exclude: |
(?x)^(
GTS_Totals_weather_act.ipynb|
IOOS_BTN.ipynb|
btn_metrics.py|
gts_atn_metrics.py|
read_bufr.py|
website/.*
)$
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
@@ -8,12 +18,6 @@ repos:
- id: end-of-file-fixer
- id: check-added-large-files

- repo: https://github.com/psf/black
rev: 24.3.0
hooks:
- id: black
language_version: python3

- repo: https://github.com/codespell-project/codespell
rev: v2.2.6
hooks:
@@ -31,6 +35,22 @@ repos:
args: ["--fix", "--show-fixes"]
- id: ruff-format

- repo: https://github.com/nbQA-dev/nbQA
rev: 1.8.5
hooks:
- id: nbqa-check-ast
- id: nbqa-black
- id: nbqa-ruff
args: [
--fix,
--config=ruff.toml,
]

- repo: https://github.com/bdice/nb-strip-paths
rev: v0.1.0
hooks:
- id: nb-strip-paths

ci:
autofix_commit_msg: |
[pre-commit.ci] auto fixes from pre-commit.com hooks
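Note: pre-commit matches the new exclude pattern above against repository-relative paths. A quick sanity check of the verbose regex, as a sketch in plain Python (the sample paths are hypothetical):

import re

# Same verbose (?x) pattern as in .pre-commit-config.yaml.
pattern = re.compile(
    r"""(?x)^(
    GTS_Totals_weather_act.ipynb|
    IOOS_BTN.ipynb|
    btn_metrics.py|
    gts_atn_metrics.py|
    read_bufr.py|
    website/.*
)$""",
)
assert pattern.match("website/index.html")  # excluded from the hooks
assert not pattern.match("ioos_metrics/ioos_metrics.py")  # still checked
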
2 changes: 2 additions & 0 deletions environment.yml
@@ -8,6 +8,8 @@ dependencies:
- fake-useragent
- fiscalyear
- geopandas>=0.13.2
- gliderpy
- html5lib
- joblib
- lxml
- pandas
160 changes: 158 additions & 2 deletions ioos_metrics/ioos_metrics.py
@@ -9,6 +9,16 @@
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from gliderpy.fetchers import GliderDataFetcher

from ioos_metrics.national_platforms import (
get_cbibs,
get_cdip,
get_coops,
get_ndbc,
get_nerrs,
get_oap,
)

logging.basicConfig(
filename="metric.log",
@@ -52,11 +62,11 @@ def previous_metrics():

def _compare_metrics(column, num) -> str:
"""Compares last stored metric against the new one and report if it is up, down, or the same."""
last_row = previous_metrics().iloc[-1]
date = last_row["date_UTC"]
if num is None:
msg = f"[{date}] : {column} failed."

last_row = previous_metrics().iloc[-1]
old = last_row[column]
if old == num:
msg = f"[{date}] : {column} equal {num} = {old}."
@@ -69,6 +79,7 @@ def _compare_metrics(column, num) -> str:
return msg


@functools.lru_cache(maxsize=128)
def federal_partners():
"""ICOOS Act/COORA.
@@ -86,9 +97,15 @@ def federal_partners():
return df_fed_partners.shape[0]

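Note: the functools.lru_cache decorators added throughout this module memoize zero-argument functions, so repeated calls within a single run hit the network only once. A minimal illustration (expensive_metric is a stand-in, not part of this commit):

import functools

@functools.lru_cache(maxsize=128)
def expensive_metric():
    print("fetching...")  # runs only on the first call
    return 42

expensive_metric()  # prints "fetching..."
expensive_metric()  # returned from the cache; no print
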

def ngdac_gliders():
@functools.lru_cache(maxsize=128)
def ngdac_gliders_fast():
"""NGDAC Glider Days.
This version uses the AllDatasets entry to compute the glider days.
It will include NaNs and, b/c of that, will return an "overestimation" for that metric.
One could argue that this is correct b/c a glider day at sea, with or without data, is a glider day.
However, for an "accurate" glider day estimation that uses only where data is collected, use ngdac_gliders.
Gliders monitor water currents, temperature, and conditions that reveal effects from storms,
impacts on fisheries, and the quality of our water.
This information creates a more complete picture of what is happening in the ocean,
@@ -135,6 +152,98 @@ def ngdac_gliders():
return df.sum().days

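Note: the collapsed lines above hide the "fast" implementation, but the docstring's approach can be sketched against the glider DAC ERDDAP allDatasets table. The endpoint and column handling below are assumptions for illustration, not code from this commit:

import pandas as pd

# Fetch each deployment's time range and sum the spans; skiprows=[1]
# drops ERDDAP's units row. Handling of missing times (the NaNs the
# docstring mentions) is omitted here.
url = (
    "https://gliders.ioos.us/erddap/tabledap/allDatasets.csv"
    "?minTime,maxTime"
)
df = pd.read_csv(url, skiprows=[1], parse_dates=["minTime", "maxTime"])
print((df["maxTime"] - df["minTime"]).sum().days)
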

@functools.lru_cache(maxsize=128)
def ngdac_gliders():
"""Same as ngdac_gliders but loops over all datasets to return a more accurate estimate for this metric.
This approach is slower but can also compute more refined metrics and other variables, like glider profiles.
"""

def _extra_info(info_df, attribute_name) -> str:
"""Get 'Attribute Name' 'Value' metadata."""
return info_df.loc[info_df["Attribute Name"] == attribute_name]["Value"].squeeze()

def _metadata(info_df) -> dict:
"""Build the metadata a specific dataset_id."""
return {
"wmo_id": _extra_info(info_df, attribute_name="wmo_id"),
"time_coverage_start": _extra_info(
info_df,
attribute_name="time_coverage_start",
),
"time_coverage_end": _extra_info(
info_df,
attribute_name="time_coverage_end",
),
"glider": dataset_id.split("-")[0],
"geospatial_lat_min": _extra_info(
info_df,
attribute_name="geospatial_lat_min",
),
"geospatial_lat_max": _extra_info(
info_df,
attribute_name="geospatial_lat_max",
),
"geospatial_lon_min": _extra_info(
info_df,
attribute_name="geospatial_lon_min",
),
"geospatial_lon_max": _extra_info(
info_df,
attribute_name="geospatial_lon_max",
),
"institution": _extra_info(info_df, attribute_name="institution"),
"sea_name": _extra_info(info_df, attribute_name="sea_name"),
"acknowledgment": _extra_info(info_df, attribute_name="acknowledgment"),
}

def _computed_metadata(dataset_id) -> dict:
"""Download the minimum amount of data possible for the computed metadata,
We cannot get first and last b/c the profile_id is not a contiguous sequence.
"""
glider_grab.fetcher.dataset_id = dataset_id
glider_grab.fetcher.variables = [
"profile_id",
"latitude",
"longitude",
"time",
]
df = glider_grab.to_pandas()
df = df.sort_index()
days = df.index[-1].ceil("D") - df.index[0].floor("D")
return {
"deployment_lat": df["latitude"].iloc[0],
"deployment_lon": df["longitude"].iloc[0],
"num_profiles": len(df),
# Profiles are not unique! Using a set would result in a smaller profile count.
# "num_profiles": len(set(df['profile_id']))
"days": days,
}

glider_grab = GliderDataFetcher()

df = glider_grab.query(
min_lat=-90.0,
max_lat=90.0,
min_lon=-180,
max_lon=180,
min_time="2000-01-01T00:00:00",
max_time="2023-12-31T23:59:59",
)

metadata = {}
for _, row in list(df.iterrows()):
dataset_id = row["Dataset ID"]
info_url = row["info_url"].replace("html", "csv")
info_df = pd.read_csv(info_url)
info = _metadata(info_df)
info.update(_computed_metadata(dataset_id=dataset_id))
metadata.update({dataset_id: info})
metadata = pd.DataFrame(metadata).T
return metadata["days"].sum().days

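Note: _computed_metadata counts days by flooring the first timestamp to midnight and ceiling the last, so a partial day at sea counts as a full glider day. A small worked example with made-up timestamps:

import pandas as pd

# A deployment from mid-morning Jan 1 to mid-afternoon Jan 3
# spans three full glider days under this convention.
start = pd.Timestamp("2023-01-01T09:30:00")
end = pd.Timestamp("2023-01-03T15:45:00")
print((end.ceil("D") - start.floor("D")).days)  # 3
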

@functools.lru_cache(maxsize=128)
def comt():
"""The COMT serves as a conduit between the federal operational
and research communities and allows sharing of numerical models,
@@ -164,6 +273,7 @@ def comt():
return comt


@functools.lru_cache(maxsize=128)
def regional_associations():
"""Finds the current IOOS Regional Associations."""
ras = 0
@@ -180,6 +290,7 @@ def regional_platforms():
return ras


@functools.lru_cache(maxsize=128)
def regional_platforms():
"""Regional platforms are calculated from the annual IOOS asset inventory submitted by each Regional Association.
More information about the IOOS asset inventory can be found at https://github.com/ioos/ioos-asset-inventory.
@@ -194,6 +305,7 @@ def atn_deployments():
return len(df.loc["rows"].iloc[0])


@functools.lru_cache(maxsize=128)
def atn_deployments():
"""See Deployments at https://portal.atn.ioos.us/#."""
headers = {"Accept": "application/json"}
@@ -211,6 +323,7 @@ def ott_projects():
return atn


@functools.lru_cache(maxsize=128)
def ott_projects():
"""The IOOS Ocean Technology Transition project sponsors the transition of emerging marine observing technologies,
for which there is an existing operational requirement
@@ -246,6 +359,7 @@ def qartod_manuals():
return ott_projects


@functools.lru_cache(maxsize=128)
def qartod_manuals():
"""As of the last update there are twelve QARTOD manuals in-place for IOOS.
These manuals establish authoritative QA/QC procedures for oceanographic data.
@@ -276,6 +390,7 @@ def ioos_core_variables():
return qartod


@functools.lru_cache(maxsize=128)
def ioos_core_variables():
"""The IOOS Core Variables are presented on
[this website](https://www.iooc.us/task-teams/core-ioos-variables/).
@@ -295,6 +410,7 @@ def metadata_records():
return len(df.index.tolist())


@functools.lru_cache(maxsize=128)
def metadata_records():
"""These are the number of metadata records currently available through the
[IOOS Catalog](https://data.ioos.us).
@@ -313,11 +429,13 @@ def ioos() -> int:
return datasets["count"]


@functools.lru_cache(maxsize=128)
def ioos() -> int:
"""Represents the one IOOS Office."""
return 1


@functools.lru_cache(maxsize=128)
def mbon_projects():
"""Living marine resources are essential to the health and recreational needs of billions of people,
yet marine biodiversity and ecosystem processes remain major frontiers in ocean observing.
@@ -355,6 +473,7 @@ def hab_pilot_projects():
return mbon_projects


@functools.lru_cache(maxsize=128)
def hab_pilot_projects():
"""These are the National Harmful Algal Bloom Observing Network Pilot Project awards.
Currently these were calculated from the
@@ -379,6 +498,24 @@ def hf_radar_installations():
return nhabon_projects + 1 # Gulf of Mexico project


@functools.lru_cache(maxsize=128)
def hf_radar_installations():
"""The previous number of 181 included all locations where
a HFR station had ever been sighted as part of the IOOS National Network,
but doesn't appear to me to have accounted for temporary installations,
HFRs unfunded by IOOS operated by international partners,
or instances where an HFR being relocated from one site to another caused it to be double-counted.
Even the number 165 represents a "high water mark" for simultaneously operating HFRs,
since HFRs routinely are taken offline for periods of time,
for both planned preventative maintenance and in response to other exigent issues.
From http://hfrnet.ucsd.edu/sitediag/stationList.php
"""
# This is a hardcoded number at the moment!
return 165

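Note: if the hardcoded 165 ever needs refreshing, the station list page cited in the docstring could in principle be scraped. The sketch below assumes that page serves a plain HTML table (html5lib and lxml are already in environment.yml); it is untested and not part of this commit:

import pandas as pd

# Hypothetical: read the first HTML table from the station list page
# and count its rows; the page's table layout is an assumption.
tables = pd.read_html("http://hfrnet.ucsd.edu/sitediag/stationList.php")
print(len(tables[0]))
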

def update_metrics(*, debug=False):
"""Load previous metrics and update the spreadsheet."""
df = previous_metrics()
@@ -393,6 +530,7 @@
"COMT Projects": comt,
"Federal Partners": federal_partners,
"HAB Pilot Projects": hab_pilot_projects,
"HF Radar Stations": hf_radar_installations,
"IOOS Core Variables": ioos_core_variables,
"IOOS": ioos,
"MBON Projects": mbon_projects,
@@ -403,6 +541,15 @@
"Regional Associations": regional_associations,
"Regional Platforms": regional_platforms,
}
# We handle the national platforms separately because this metric hits multiple services.
national_platforms_functions = [
get_cbibs,
get_cdip,
get_coops,
get_ndbc,
get_nerrs,
get_oap,
]

# We cannot write the log in parallel. When debugging we should run the queries in serial mode.
if debug:
@@ -416,13 +563,22 @@
# Log status.
message = _compare_metrics(column=column, num=num)
logging.info(f"{message}")

national_platforms = [function() for function in national_platforms_functions]
new_row.update({"National Platforms": national_platforms})
else:
cpu_count = joblib.cpu_count()
parallel = joblib.Parallel(n_jobs=cpu_count, return_as="generator")

values = parallel(joblib.delayed(function)() for function in functions.values())
columns = dict(zip(functions.keys(), values, strict=False))
new_row.update(columns)

national_platforms = sum(
parallel(joblib.delayed(function)() for function in national_platforms_functions),
)
new_row.update({"National Platforms": national_platforms})

new_row = pd.DataFrame.from_dict(data=new_row, orient="index").T

# only update numbers if it's a new day
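Note: a minimal standalone sketch of the joblib fan-out used above (return_as="generator" requires joblib >= 1.3; slow_metric is a stand-in for the metric functions):

import joblib

def slow_metric():
    return 1  # stand-in for a network-bound metric function

parallel = joblib.Parallel(n_jobs=joblib.cpu_count(), return_as="generator")
values = parallel(joblib.delayed(slow_metric)() for _ in range(4))
print(sum(values))  # 4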