add hf radar, national_platforms and slow but accurate glider metric
ocefpaf committed Apr 8, 2024
1 parent d64412d commit d924b1f
Showing 8 changed files with 547 additions and 19 deletions.
32 changes: 26 additions & 6 deletions .pre-commit-config.yaml
@@ -1,3 +1,13 @@
exclude: |
(?x)^(
GTS_Totals_weather_act.ipynb|
IOOS_BTN.ipynb|
btn_metrics.py|
gts_atn_metrics.py|
read_bufr.py|
website/.*
)$
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
@@ -8,12 +18,6 @@ repos:
- id: end-of-file-fixer
- id: check-added-large-files

- repo: https://github.com/psf/black
rev: 24.3.0
hooks:
- id: black
language_version: python3

- repo: https://github.com/codespell-project/codespell
rev: v2.2.6
hooks:
@@ -31,6 +35,22 @@ repos:
args: ["--fix", "--show-fixes"]
- id: ruff-format

- repo: https://github.com/nbQA-dev/nbQA
rev: 1.8.5
hooks:
- id: nbqa-check-ast
- id: nbqa-black
- id: nbqa-ruff
args: [
--fix,
--config=ruff.toml,
]

- repo: https://github.com/bdice/nb-strip-paths
rev: v0.1.0
hooks:
- id: nb-strip-paths

ci:
autofix_commit_msg: |
[pre-commit.ci] auto fixes from pre-commit.com hooks
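Note: pre-commit matches the new exclude pattern above against repository-relative paths. A quick sanity check of the verbose regex, as a sketch in plain Python (the sample paths are hypothetical):

import re

# Same verbose (?x) pattern as in .pre-commit-config.yaml.
pattern = re.compile(
    r"""(?x)^(
    GTS_Totals_weather_act.ipynb|
    IOOS_BTN.ipynb|
    btn_metrics.py|
    gts_atn_metrics.py|
    read_bufr.py|
    website/.*
)$""",
)
assert pattern.match("website/index.html")  # excluded from the hooks
assert not pattern.match("ioos_metrics/ioos_metrics.py")  # still checked
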
2 changes: 2 additions & 0 deletions environment.yml
@@ -8,6 +8,8 @@ dependencies:
- fake-useragent
- fiscalyear
- geopandas>=0.13.2
- gliderpy
- html5lib
- joblib
- lxml
- pandas
160 changes: 158 additions & 2 deletions ioos_metrics/ioos_metrics.py
@@ -9,6 +9,16 @@
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from gliderpy.fetchers import GliderDataFetcher

from ioos_metrics.national_platforms import (
get_cbibs,
get_cdip,
get_coops,
get_ndbc,
get_nerrs,
get_oap,
)

logging.basicConfig(
filename="metric.log",
@@ -52,11 +62,11 @@ def previous_metrics():

def _compare_metrics(column, num) -> str:
"""Compares last stored metric against the new one and report if it is up, down, or the same."""
last_row = previous_metrics().iloc[-1]
date = last_row["date_UTC"]
if num is None:
msg = f"[{date}] : {column} failed."

last_row = previous_metrics().iloc[-1]
old = last_row[column]
if old == num:
msg = f"[{date}] : {column} equal {num} = {old}."
@@ -69,6 +79,7 @@ def _compare_metrics(column, num) -> str:
return msg


@functools.lru_cache(maxsize=128)
def federal_partners():
"""ICOOS Act/COORA.
@@ -86,9 +97,15 @@ def federal_partners():
return df_fed_partners.shape[0]

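Note: the functools.lru_cache decorators added throughout this module memoize zero-argument functions, so repeated calls within a single run hit the network only once. A minimal illustration (expensive_metric is a stand-in, not part of this commit):

import functools

@functools.lru_cache(maxsize=128)
def expensive_metric():
    print("fetching...")  # runs only on the first call
    return 42

expensive_metric()  # prints "fetching..."
expensive_metric()  # returned from the cache; no print
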

def ngdac_gliders():
@functools.lru_cache(maxsize=128)
def ngdac_gliders_fast():
"""NGDAC Glider Days.
This version uses the AllDatasets entry to compute the glider days.
It will include NaNs and, b/c of that, will return an "overestimation" for that metric.
One could argue that this is correct b/c a glider day at sea, with or without data, is a glider day.
However, for an "accurate" glider day estimation that uses only where data is collected, use ngdac_gliders.
Gliders monitor water currents, temperature, and conditions that reveal effects from storms,
impacts on fisheries, and the quality of our water.
This information creates a more complete picture of what is happening in the ocean,
@@ -135,6 +152,98 @@ def ngdac_gliders():
return df.sum().days

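Note: the collapsed lines above hide the "fast" implementation, but the docstring's approach can be sketched against the glider DAC ERDDAP allDatasets table. The endpoint and column handling below are assumptions for illustration, not code from this commit:

import pandas as pd

# Fetch each deployment's time range and sum the spans; skiprows=[1]
# drops ERDDAP's units row. Handling of missing times (the NaNs the
# docstring mentions) is omitted here.
url = (
    "https://gliders.ioos.us/erddap/tabledap/allDatasets.csv"
    "?minTime,maxTime"
)
df = pd.read_csv(url, skiprows=[1], parse_dates=["minTime", "maxTime"])
print((df["maxTime"] - df["minTime"]).sum().days)
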

@functools.lru_cache(maxsize=128)
def ngdac_gliders():
"""Same as ngdac_gliders but loops over all datasets to return a more accurate estimate for this metric.
This approach is slower but can also compute more refined metrics and other variables, like glider profiles.
"""

def _extra_info(info_df, attribute_name) -> str:
"""Get 'Attribute Name' 'Value' metadata."""
return info_df.loc[info_df["Attribute Name"] == attribute_name]["Value"].squeeze()

def _metadata(info_df) -> dict:
"""Build the metadata a specific dataset_id."""
return {
"wmo_id": _extra_info(info_df, attribute_name="wmo_id"),
"time_coverage_start": _extra_info(
info_df,
attribute_name="time_coverage_start",
),
"time_coverage_end": _extra_info(
info_df,
attribute_name="time_coverage_end",
),
"glider": dataset_id.split("-")[0],
"geospatial_lat_min": _extra_info(
info_df,
attribute_name="geospatial_lat_min",
),
"geospatial_lat_max": _extra_info(
info_df,
attribute_name="geospatial_lat_max",
),
"geospatial_lon_min": _extra_info(
info_df,
attribute_name="geospatial_lon_min",
),
"geospatial_lon_max": _extra_info(
info_df,
attribute_name="geospatial_lon_max",
),
"institution": _extra_info(info_df, attribute_name="institution"),
"sea_name": _extra_info(info_df, attribute_name="sea_name"),
"acknowledgment": _extra_info(info_df, attribute_name="acknowledgment"),
}

def _computed_metadata(dataset_id) -> dict:
"""Download the minimum amount of data possible for the computed metadata,
We cannot get first and last b/c the profile_id is not a contiguous sequence.
"""
glider_grab.fetcher.dataset_id = dataset_id
glider_grab.fetcher.variables = [
"profile_id",
"latitude",
"longitude",
"time",
]
df = glider_grab.to_pandas()
df = df.sort_index()
days = df.index[-1].ceil("D") - df.index[0].floor("D")
return {
"deployment_lat": df["latitude"].iloc[0],
"deployment_lon": df["longitude"].iloc[0],
"num_profiles": len(df),
# Profiles are not unique! Using a set would result in a smaller profile count.
# "num_profiles": len(set(df['profile_id']))
"days": days,
}

glider_grab = GliderDataFetcher()

df = glider_grab.query(
min_lat=-90.0,
max_lat=90.0,
min_lon=-180,
max_lon=180,
min_time="2000-01-01T00:00:00",
max_time="2023-12-31T23:59:59",
)

metadata = {}
for _, row in list(df.iterrows()):
dataset_id = row["Dataset ID"]
info_url = row["info_url"].replace("html", "csv")
info_df = pd.read_csv(info_url)
info = _metadata(info_df)
info.update(_computed_metadata(dataset_id=dataset_id))
metadata.update({dataset_id: info})
metadata = pd.DataFrame(metadata).T
return metadata["days"].sum().days

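Note: _computed_metadata counts days by flooring the first timestamp to midnight and ceiling the last, so a partial day at sea counts as a full glider day. A small worked example with made-up timestamps:

import pandas as pd

# A deployment from mid-morning Jan 1 to mid-afternoon Jan 3
# spans three full glider days under this convention.
start = pd.Timestamp("2023-01-01T09:30:00")
end = pd.Timestamp("2023-01-03T15:45:00")
print((end.ceil("D") - start.floor("D")).days)  # 3
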

@functools.lru_cache(maxsize=128)
def comt():
"""The COMT serves as a conduit between the federal operational
and research communities and allows sharing of numerical models,
@@ -164,6 +273,7 @@ def comt():
return comt


@functools.lru_cache(maxsize=128)
def regional_associations():
"""Finds the current IOOS Regional Associations."""
ras = 0
@@ -180,6 +290,7 @@ def regional_platforms():
return ras


@functools.lru_cache(maxsize=128)
def regional_platforms():
"""Regional platforms are calculated from the annual IOOS asset inventory submitted by each Regional Association.
More information about the IOOS asset inventory can be found at https://github.com/ioos/ioos-asset-inventory.
@@ -194,6 +305,7 @@ def atn_deployments():
return len(df.loc["rows"].iloc[0])


@functools.lru_cache(maxsize=128)
def atn_deployments():
"""See Deployments at https://portal.atn.ioos.us/#."""
headers = {"Accept": "application/json"}
@@ -211,6 +323,7 @@ def ott_projects():
return atn


@functools.lru_cache(maxsize=128)
def ott_projects():
"""The IOOS Ocean Technology Transition project sponsors the transition of emerging marine observing technologies,
for which there is an existing operational requirement
@@ -246,6 +359,7 @@ def qartod_manuals():
return ott_projects


@functools.lru_cache(maxsize=128)
def qartod_manuals():
"""As of the last update there are twelve QARTOD manuals in-place for IOOS.
These manuals establish authoritative QA/QC procedures for oceanographic data.
@@ -276,6 +390,7 @@ def ioos_core_variables():
return qartod


@functools.lru_cache(maxsize=128)
def ioos_core_variables():
"""The IOOS Core Variables are presented on
[this website](https://www.iooc.us/task-teams/core-ioos-variables/).
@@ -295,6 +410,7 @@ def metadata_records():
return len(df.index.tolist())


@functools.lru_cache(maxsize=128)
def metadata_records():
"""These are the number of metadata records currently available through the
[IOOS Catalog](https://data.ioos.us).
@@ -313,11 +429,13 @@ def ioos() -> int:
return datasets["count"]


@functools.lru_cache(maxsize=128)
def ioos() -> int:
"""Represents the one IOOS Office."""
return 1


@functools.lru_cache(maxsize=128)
def mbon_projects():
"""Living marine resources are essential to the health and recreational needs of billions of people,
yet marine biodiversity and ecosystem processes remain major frontiers in ocean observing.
@@ -355,6 +473,7 @@ def hab_pilot_projects():
return mbon_projects


@functools.lru_cache(maxsize=128)
def hab_pilot_projects():
"""These are the National Harmful Algal Bloom Observing Network Pilot Project awards.
Currently these were calculated from the
@@ -379,6 +498,24 @@ def hf_radar_installations():
return nhabon_projects + 1 # Gulf of Mexico project


@functools.lru_cache(maxsize=128)
def hf_radar_installations():
"""The previous number of 181 included all locations where
a HFR station had ever been sighted as part of the IOOS National Network,
but doesn't appear to me to have accounted for temporary installations,
HFRs unfunded by IOOS operated by international partners,
or instances where an HFR being relocated from one site to another caused it to be double-counted.
Even the number 165 represents a "high water mark" for simultaneously operating HFRs,
since HFRs routinely are taken offline for periods of time,
for both planned preventative maintenance and in response to other exigent issues.
From http://hfrnet.ucsd.edu/sitediag/stationList.php
"""
# This is a hardcoded number at the moment!
return 165

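Note: if the hardcoded 165 ever needs refreshing, the station list page cited in the docstring could in principle be scraped. The sketch below assumes that page serves a plain HTML table (html5lib and lxml are already in environment.yml); it is untested and not part of this commit:

import pandas as pd

# Hypothetical: read the first HTML table from the station list page
# and count its rows; the page's table layout is an assumption.
tables = pd.read_html("http://hfrnet.ucsd.edu/sitediag/stationList.php")
print(len(tables[0]))
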

def update_metrics(*, debug=False):
"""Load previous metrics and update the spreadsheet."""
df = previous_metrics()
@@ -393,6 +530,7 @@
"COMT Projects": comt,
"Federal Partners": federal_partners,
"HAB Pilot Projects": hab_pilot_projects,
"HF Radar Stations": hf_radar_installations,
"IOOS Core Variables": ioos_core_variables,
"IOOS": ioos,
"MBON Projects": mbon_projects,
@@ -403,6 +541,15 @@
"Regional Associations": regional_associations,
"Regional Platforms": regional_platforms,
}
# We handle the national platforms separately because this metric hits multiple services.
national_platforms_functions = [
get_cbibs,
get_cdip,
get_coops,
get_ndbc,
get_nerrs,
get_oap,
]

# We cannot write the log in parallel. When debugging we should run the queries in serial mode.
if debug:
@@ -416,13 +563,22 @@
# Log status.
message = _compare_metrics(column=column, num=num)
logging.info(f"{message}")

national_platforms = [function() for function in national_platforms_functions]
new_row.update({"National Platforms": national_platforms})
else:
cpu_count = joblib.cpu_count()
parallel = joblib.Parallel(n_jobs=cpu_count, return_as="generator")

values = parallel(joblib.delayed(function)() for function in functions.values())
columns = dict(zip(functions.keys(), values, strict=False))
new_row.update(columns)

national_platforms = sum(
parallel(joblib.delayed(function)() for function in national_platforms_functions),
)
new_row.update({"National Platforms": national_platforms})

new_row = pd.DataFrame.from_dict(data=new_row, orient="index").T

# only update numbers if it's a new day
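Note: a minimal standalone sketch of the joblib fan-out used above (return_as="generator" requires joblib >= 1.3; slow_metric is a stand-in for the metric functions):

import joblib

def slow_metric():
    return 1  # stand-in for a network-bound metric function

parallel = joblib.Parallel(n_jobs=joblib.cpu_count(), return_as="generator")
values = parallel(joblib.delayed(slow_metric)() for _ in range(4))
print(sum(values))  # 4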