From b2accfb885b4a8ea7dce1ad2bfdff4d089ac31b1 Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Thu, 10 Oct 2024 14:37:25 +0100 Subject: [PATCH] examples: using data to define your workflow --- .../examples/external-data-files/.validate | 32 +++++ .../examples/external-data-files/flow.cylc | 66 ++++++++++ .../examples/external-data-files/index.rst | 122 ++++++++++++++++++ .../lib/python/load_data.py | 12 ++ .../examples/external-data-files/stations.csv | 7 + .../external-data-files/stations.json | 44 +++++++ 6 files changed, 283 insertions(+) create mode 100755 cylc/flow/etc/examples/external-data-files/.validate create mode 100644 cylc/flow/etc/examples/external-data-files/flow.cylc create mode 100644 cylc/flow/etc/examples/external-data-files/index.rst create mode 100644 cylc/flow/etc/examples/external-data-files/lib/python/load_data.py create mode 100644 cylc/flow/etc/examples/external-data-files/stations.csv create mode 100644 cylc/flow/etc/examples/external-data-files/stations.json diff --git a/cylc/flow/etc/examples/external-data-files/.validate b/cylc/flow/etc/examples/external-data-files/.validate new file mode 100755 index 00000000000..2ea8bbb7bd9 --- /dev/null +++ b/cylc/flow/etc/examples/external-data-files/.validate @@ -0,0 +1,32 @@ +#!/bin/bash +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. 
If not, see <http://www.gnu.org/licenses/>. + +set -eux + +ID="$(< /dev/urandom tr -dc A-Za-z | head -c6)" + +# run the workflow +cylc vip --check-circular --no-run-name --no-detach --workflow-name "$ID" + +# check the station:ID mapping came out as expected +grep 'fetch data for heathrow, WMO ID: 03772' "${HOME}/cylc-run/${ID}/log/job/20000101T0000Z/fetch_heathrow/NN/job.out" + +# lint +cylc lint "$ID" + +# clean up +cylc clean "$ID" diff --git a/cylc/flow/etc/examples/external-data-files/flow.cylc b/cylc/flow/etc/examples/external-data-files/flow.cylc new file mode 100644 index 00000000000..232448935d4 --- /dev/null +++ b/cylc/flow/etc/examples/external-data-files/flow.cylc @@ -0,0 +1,66 @@ +#!Jinja2 + +[meta] + title = Weather Station Workflow + description = """ + This workflow demonstrates how to read in a data file for use in + defining your workflow. + + We have a file called "stations.json" which contains a list of weather + stations with some data for each. This workflow reads the + "stations.json" file and creates a family for each weather station + with an environment variable for each data field. + + You can load data in other formats too. Try changing "load_json" to + "load_csv" and "stations.json" to "stations.csv" for a CSV example. + """ + + +{# Import a Python function to load our data. #} +{% from "load_data" import load_json %} + +{# Load data from the specified file. #} +{% set stations = load_json('stations.json') %} + +{# Extract a list of station names from the data file. #} +{% set station_names = stations | map(attribute="name") | list %} + + +{# Provide Cylc with a list of weather stations. 
#} +[task parameters] + station = {{ station_names | join(', ') }} + + +[scheduling] + initial cycle point = 2000-01-01 + final cycle point = 2000-01-02 + [[graph]] + P1D = fetch<station> => process<station> => collate + + +[runtime] +{# Define a family for each weather station #} +{% for station in stations %} + [[STATION<station={{ station["name"] }}>]] + [[[environment]]] + {# Turn the parameter into an environment variable #} + {# NB: Just to show how, we could also have used `station["name"]`. #} + name = %(station)s + {# Turn the data for this station into environment variables. #} + wmo = {{ station["wmo"] }} + alt = {{ station["alt"] }} + lat = {{ station["lat"] }} + lon = {{ station["lon"] }} +{% endfor %} + + # a task that gets data + [[fetch<station>]] + inherit = STATION<station> + script = echo "fetch data for $name, WMO ID: $wmo" + + [[process<station>]] + inherit = STATION<station> + script = echo "process data for $name, location: $lat,$lon" + + [[collate]] + script = "echo collate data for stations: {{ station_names }}" diff --git a/cylc/flow/etc/examples/external-data-files/index.rst b/cylc/flow/etc/examples/external-data-files/index.rst new file mode 100644 index 00000000000..e68b34cfc50 --- /dev/null +++ b/cylc/flow/etc/examples/external-data-files/index.rst @@ -0,0 +1,122 @@ +Using Data To Define Your Workflow +================================== + +.. admonition:: Get a copy of this example + :class: hint + + .. code-block:: console + + $ cylc get-resources examples/external-data-files + +We often want to read in a dataset for use in defining our workflow. + +The :ref:`Cylc tutorial <tutorial-cylc-consolidating-configuration>` is an +example of this where we want one ``get_observations`` task for each of a list +of weather stations. Each weather station has a name (e.g. "heathrow") and an +ID (e.g. 3772). + +.. 
code-block:: cylc + + [runtime] + [[get_observations_heathrow]] + script = get-observations + [[[environment]]] + SITE_ID = 3772 + [[get_observations_camborne]] + script = get-observations + [[[environment]]] + SITE_ID = 3808 + [[get_observations_shetland]] + script = get-observations + [[[environment]]] + SITE_ID = 3005 + [[get_observations_aldergrove]] + script = get-observations + [[[environment]]] + SITE_ID = 3917 + +It can be inconvenient to write out the name and ID of each station in your +workflow like this. However, you may already have this information in a more +convenient format (i.e. a data file of some form). + +With Cylc, we can use :ref:`Jinja2 <Jinja>` to read in a data file and use that data to +define our workflow. + + +The Approach +------------ + +This example has three components: + +1. A JSON file containing a list of weather stations along with all the data + associated with them. + + .. literalinclude:: stations.json + :language: json + :caption: stations.json + +2. A Python function that reads the JSON file. + + .. code-block:: python + :caption: lib/python/load_data.py + + import json + + + def load_json(filename): + with open(filename, 'r') as json_file: + return json.load(json_file) + + We put this Python code in the workflow's ``lib/python`` directory which + allows us to import it from within our workflow. + +3. A ``flow.cylc`` file that uses the Python function to load the + data file. + + We can import Python functions with Jinja2 using the following syntax: + + .. code-block:: + + {% from "load_data" import load_json %} + + For more information, see :ref:`jinja2.importing_python_modules`. + + + +The Workflow +------------ + +The three files are arranged like so: + +.. code-block:: none + :caption: File Structure + + |-- flow.cylc + |-- lib + | `-- python + | `-- load_data.py + `-- stations.json + +The ``flow.cylc`` file: + +* Imports the Python function. +* Uses it to load the data. +* Then uses the data to define the workflow. + +.. 
"""Helpers for loading station data files from a Jinja2-templated workflow."""

import csv
import json


def load_csv(filename):
    """Return the rows of a CSV file as a list of dictionaries.

    The first row of the file supplies the keys (``csv.DictReader``
    behaviour); every value is returned as a string.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one ``dict`` per data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # newline='' is what the csv module asks for: without it, newlines
    # embedded in quoted fields are rewritten by universal-newline mode.
    # The explicit encoding stops the result depending on the locale.
    with open(filename, 'r', encoding='utf-8', newline='') as csv_file:
        return list(csv.DictReader(csv_file))


def load_json(filename):
    """Return the decoded contents of a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever Python object the JSON document decodes to.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text exchanged between systems is UTF-8, so do not fall back to
    # the locale's default encoding.
    with open(filename, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)
@@ -0,0 +1,44 @@ +[ + { + "name": "camborne", + "wmo": "03808", + "alt": 87, + "lat": 50.21841, + "lon": -5.32753 + }, + { + "name": "heathrow", + "wmo": "03772", + "alt": 25, + "lat": 51.47922, + "lon": -0.45061 + }, + { + "name": "lerwick", + "wmo": "03005", + "alt": 82, + "lat": 60.13893, + "lon": -1.18491 + }, + { + "name": "aldergrove", + "wmo": "03917", + "alt": 63, + "lat": 54.66365, + "lon": -6.22534 + }, + { + "name": "exeter", + "wmo": "03844", + "alt": 27, + "lat": 50.73717, + "lon": -3.40579 + }, + { + "name": "middle_wallop", + "wmo": "03749", + "alt": 90, + "lat": 51.14987, + "lon": -1.56994 + } +]