From e724bd7819fb78da2cf57b563116def5a68d4a1d Mon Sep 17 00:00:00 2001 From: e-belfer Date: Tue, 7 Jan 2025 10:37:47 -0500 Subject: [PATCH 1/4] Move MECS to metadata sources dictionary, add non-pudl-metadata method --- src/pudl_archiver/frictionless.py | 53 ++++++++++++++------------- src/pudl_archiver/metadata/sources.py | 38 +++++++++++++++++++ 2 files changed, 65 insertions(+), 26 deletions(-) create mode 100644 src/pudl_archiver/metadata/sources.py diff --git a/src/pudl_archiver/frictionless.py b/src/pudl_archiver/frictionless.py index 34796d9f..31554bef 100644 --- a/src/pudl_archiver/frictionless.py +++ b/src/pudl_archiver/frictionless.py @@ -9,8 +9,10 @@ from pudl.metadata.classes import Contributor, DataSource, License from pudl.metadata.constants import CONTRIBUTORS +from pudl.metadata.sources import SOURCES from pydantic import BaseModel, Field, field_serializer +from pudl_archiver.metadata.sources import NON_PUDL_SOURCES from pudl_archiver.utils import Url MEDIA_TYPES: dict[str, str] = { @@ -149,9 +151,13 @@ def new_datapackage( containing the local path to the resource, and its working partitions. version: Version string for current deposition version. 
""" - if name == "mecs": - return cls.mecs(resources=resources, version=version) - return cls.from_pudl_metadata(name=name, resources=resources, version=version) + if name in SOURCES: # If data source in PUDL source metadata + return cls.from_pudl_metadata( + name=name, resources=resources, version=version + ) + return cls.from_non_pudl_metadata( + name=name, resources=resources, version=version + ) @classmethod def from_pudl_metadata( @@ -177,30 +183,25 @@ def from_pudl_metadata( ) @classmethod - def mecs(cls, resources: Iterable[Resource], version: str | None): - """Hack method to create a Datapackage for EIA MECS data not in PUDL metadata.""" + def from_non_pudl_metadata( + cls, + name: str, + resources: Iterable[Resource], + version: str | None, + ): + """Create a datapackage for sources that won't end up in PUDL.""" + data_source = DataSource.from_id(name, sources=NON_PUDL_SOURCES) + return DataPackage( - name="MECS", - title="EIA MECS data", - sources=[ - { - "title": "EIA MECS data", - "path": "https://www.eia.gov/consumption/manufacturing/data/2018/", - } - ], - licenses=[ - License( - **{ - "name": "CC-BY-4.0", - "title": "Creative Commons Attribution 4.0", - "path": "https://creativecommons.org/licenses/by/4.0", - } - ) - ], + name=data_source.name, + title=data_source.title, + sources=[{"title": data_source.title, "path": str(data_source.path)}], + licenses=[data_source.license_raw], resources=sorted(resources, key=lambda x: x.name), # Sort by filename - contributors=[CONTRIBUTORS["catalyst-cooperative"]], - created=str(datetime.datetime.now(tz=datetime.UTC).isoformat()), - keywords=["MECS"], - description="According to the EIA, the Manufacturing Energy Consumption Survey (MECS) is a national sample survey that collects information on the stock of U.S. 
manufacturing establishment, their energy-related building characteristics, and their energy consumption and expenditures.", + # Make it easier to add contributors directly into source dictionary + contributors=data_source.contributors, + created=datetime.datetime.now(tz=datetime.UTC).isoformat(), + keywords=data_source.keywords, + description=data_source.description, version=version, ) diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py new file mode 100644 index 00000000..49f5fdab --- /dev/null +++ b/src/pudl_archiver/metadata/sources.py @@ -0,0 +1,38 @@ +"""Metadata and operational constants.""" + +from typing import Any + +from pudl.metadata.constants import CONTRIBUTORS, LICENSES + +NON_PUDL_SOURCES: dict[str, Any] = { + "eiamecs": { + "title": "EIA Manufacturing Energy Consumption Survey", + "path": "https://www.eia.gov/consumption/manufacturing/data/2018/", + "description": ( + "EIA Form 846 A and B is more commonly known as the Manufacturing Energy", + "Consumption Survey (MECS). MECS is a national sample survey that collects", + "information on the stock of U.S. manufacturing establishment, their", + "energy-related building characteristics, and their energy consumption", + "and expenditures. MECS is conducted every four years.", + ), + "working_partitions": { + 1991, + 1994, + 1998, + 2002, + 2006, + 2010, + 2014, + 2018, + }, # Census DP1 is monolithic. 
+ "keywords": sorted( + { + "manufacturing", + "MECS", + } + ), + "license_raw": LICENSES["us-govt"], + "license_pudl": LICENSES["cc-by-4.0"], + "contributors": [CONTRIBUTORS["catalyst-cooperative"]], + }, +} From e2a4fb50d5f3bbd33f8ba500ac483ff94bb7f13d Mon Sep 17 00:00:00 2001 From: e-belfer Date: Wed, 8 Jan 2025 11:48:18 -0500 Subject: [PATCH 2/4] Make archiving flow work, rename to eiamecs, update README --- README.md | 28 +++++++++++++-- .../archivers/eia/{mecs.py => eiamecs.py} | 2 +- .../depositors/zenodo/entities.py | 10 +++++- src/pudl_archiver/frictionless.py | 6 +++- src/pudl_archiver/metadata/sources.py | 35 ++++++++++++------- 5 files changed, 62 insertions(+), 19 deletions(-) rename src/pudl_archiver/archivers/eia/{mecs.py => eiamecs.py} (98%) diff --git a/README.md b/README.md index d648eb61..595e1382 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,29 @@ There are also four optional flags available: ## Adding a new dataset -### Step 1: Implement archiver interface +### Step 1: Define the dataset's metadata +For each dataset we archive, we record information about the title, a description, who +contributed to archiving the dataset, the segments into which the data files are +partitioned, its license and keywords. This +information is used to communicate about the dataset's usage and provenance to any +future users. + +* Title: The title of your dataset should clearly contain the agency publishing the data and a non-abbreviated title (e.g., EIA Manufacturing Energy Consumption Survey, not EIA MECS). +* Path: The link to the dataset's "homepage", where information about the dataset and the path to download it can be found. +* Working partitions: A dictionary where the key is the name of the partition (e.g., month, year, form), and the values are the actual available partitions (e.g., 2002-2020). 
+* License: We only archive data with an open source license (e.g., US Government Works or a Creative Commons License), so make sure any data you're archiving is licensed for re-distribution. +* Keywords: Words that someone might use to search for this dataset. These are used to help people find our data on Zenodo. + +If your dataset will be integrated directly into +[PUDL](https://github.com/catalyst-cooperative/pudl), you'll need to add the metadata +for the dataset into the PUDL repository in the `SOURCES` dictionary in +`src/pudl/metadata/sources.py`. + +If you aren't sure, or you're archiving data that won't go into PUDL, you'll want to +add your metadata as an entry into the `NON_PUDL_SOURCES` dictionary in +`src/pudl_archiver/metadata/sources.py`. + +### Step 2: Implement archiver interface All of the archivers inherit from the `AbstractDatasetArchiver` base class (defined in `src/pudl_archiver/archiver/classes.py`. There is only a single method that each @@ -130,7 +152,7 @@ hyperlinks matching the pattern on the page pointed to by the URL. This is usefu there's a page containing links to a series of data resources that have somewhat structured names. -### Step 2: Run --initialize command +### Step 3: Run --initialize command You will need to run the initialize command to create a new zenodo deposition, and update the config file with the new DOI: @@ -145,7 +167,7 @@ require you to create your own [Zenodo validation credentials](https://zenodo.org/account/settings/applications/tokens/new/) if you are not a core Catalyst developer. -### Step 3: Manually review your archive before publication. +### Step 4: Manually review your archive before publication. If the archiver run is successful, it will produce a link to the draft archive. 
Though many of the validation steps are automated, it is worthwhile manually reviewing archives diff --git a/src/pudl_archiver/archivers/eia/mecs.py b/src/pudl_archiver/archivers/eia/eiamecs.py similarity index 98% rename from src/pudl_archiver/archivers/eia/mecs.py rename to src/pudl_archiver/archivers/eia/eiamecs.py index cb2b5d41..c3ef0fbf 100644 --- a/src/pudl_archiver/archivers/eia/mecs.py +++ b/src/pudl_archiver/archivers/eia/eiamecs.py @@ -16,7 +16,7 @@ class EiaMECSArchiver(AbstractDatasetArchiver): """EIA MECS archiver.""" - name = "mecs" + name = "eiamecs" async def get_resources(self) -> ArchiveAwaitable: """Download EIA-MECS resources.""" diff --git a/src/pudl_archiver/depositors/zenodo/entities.py b/src/pudl_archiver/depositors/zenodo/entities.py index e3bddbee..2b04ee71 100644 --- a/src/pudl_archiver/depositors/zenodo/entities.py +++ b/src/pudl_archiver/depositors/zenodo/entities.py @@ -9,9 +9,11 @@ from typing import Annotated, Literal from pudl.metadata.classes import Contributor, DataSource +from pudl.metadata.sources import SOURCES from pydantic import BaseModel, Field, StringConstraints, field_validator from pudl_archiver.depositors.depositor import DepositionState +from pudl_archiver.metadata.sources import NON_PUDL_SOURCES from pudl_archiver.utils import Url logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -112,7 +114,13 @@ def check_empty_string(cls, doi: str): # noqa: N805 @classmethod def from_data_source(cls, data_source_id: str) -> "DepositionMetadata": """Construct deposition metadata object from PUDL DataSource model.""" - data_source = DataSource.from_id(data_source_id) + # Identify whether metadata originates from PUDL or archiver repo + sources = SOURCES if data_source_id in SOURCES else NON_PUDL_SOURCES + # TODO: This is a hacky workaround to having to update from_id to take sources + # in PUDL - should I just fix this at the source to make it more legible? 
+ data_source = DataSource( + **DataSource.dict_from_id(x=data_source_id, sources=sources) + ) creators = [ DepositionCreator.from_contributor(contributor) for contributor in data_source.contributors diff --git a/src/pudl_archiver/frictionless.py b/src/pudl_archiver/frictionless.py index 31554bef..c5912b6e 100644 --- a/src/pudl_archiver/frictionless.py +++ b/src/pudl_archiver/frictionless.py @@ -190,7 +190,11 @@ def from_non_pudl_metadata( version: str | None, ): """Create a datapackage for sources that won't end up in PUDL.""" - data_source = DataSource.from_id(name, sources=NON_PUDL_SOURCES) + # TODO: This is a slightly ugly workaround to avoid having to add sources into + # the from_id method - should I just fix this at the source? + data_source = DataSource( + **DataSource.dict_from_id(x=name, sources=NON_PUDL_SOURCES) + ) return DataPackage( name=data_source.name, diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py index 49f5fdab..db3bc8c9 100644 --- a/src/pudl_archiver/metadata/sources.py +++ b/src/pudl_archiver/metadata/sources.py @@ -4,26 +4,35 @@ from pudl.metadata.constants import CONTRIBUTORS, LICENSES +# To add a new contributor, follow the following format to add an entry to the +# ADDL_CONTRIBUTORS dictionary below formatted like this: +# "name-shorthand": { +# "title": "Catalyst Cooperative", +# "email": "pudl@catalyst.coop", +# "path": "https://catalyst.coop", +# "role": "publisher", +# "zenodo_role": "distributor", +# "organization": "Catalyst Cooperative", +# "orcid": "0000-1234-5678-9101" +# } +# Note that the only required fields are title (your name) and path +# (e.g., a link to your Github account, your ORCID site or a personal webpage), but +# filling other fields is strongly encouraged! 
+ADDL_CONTRIBUTORS: dict[str, dict[str, str]] = {} + NON_PUDL_SOURCES: dict[str, Any] = { "eiamecs": { "title": "EIA Manufacturing Energy Consumption Survey", "path": "https://www.eia.gov/consumption/manufacturing/data/2018/", "description": ( - "EIA Form 846 A and B is more commonly known as the Manufacturing Energy", - "Consumption Survey (MECS). MECS is a national sample survey that collects", - "information on the stock of U.S. manufacturing establishment, their", - "energy-related building characteristics, and their energy consumption", - "and expenditures. MECS is conducted every four years.", + "EIA Form 846 A and B is more commonly known as the Manufacturing Energy " + "Consumption Survey (MECS). MECS is a national sample survey that collects " + "information on the stock of U.S. manufacturing establishment, their " + "energy-related building characteristics, and their energy consumption " + "and expenditures. MECS is conducted every four years." ), "working_partitions": { - 1991, - 1994, - 1998, - 2002, - 2006, - 2010, - 2014, - 2018, + "years": [1991, 1994, 1998, 2002, 2006, 2010, 2014, 2018] }, # Census DP1 is monolithic. 
"keywords": sorted( { From d4f0acbcec2a958bafa0fc64fb5ffa0d7449a743 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Tue, 14 Jan 2025 18:03:34 -0500 Subject: [PATCH 3/4] Restore from_id() method --- src/pudl_archiver/depositors/zenodo/entities.py | 6 +----- src/pudl_archiver/frictionless.py | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/pudl_archiver/depositors/zenodo/entities.py b/src/pudl_archiver/depositors/zenodo/entities.py index 2b04ee71..4b91a10d 100644 --- a/src/pudl_archiver/depositors/zenodo/entities.py +++ b/src/pudl_archiver/depositors/zenodo/entities.py @@ -116,11 +116,7 @@ def from_data_source(cls, data_source_id: str) -> "DepositionMetadata": """Construct deposition metadata object from PUDL DataSource model.""" # Identify whether metadata originates from PUDL or archiver repo sources = SOURCES if data_source_id in SOURCES else NON_PUDL_SOURCES - # TODO: This is a hacky workaround to having to update from_id to take sources - # in PUDL - should I just fix this at the source to make it more legible? - data_source = DataSource( - **DataSource.dict_from_id(x=data_source_id, sources=sources) - ) + data_source = DataSource.from_id(data_source_id, sources=sources) creators = [ DepositionCreator.from_contributor(contributor) for contributor in data_source.contributors diff --git a/src/pudl_archiver/frictionless.py b/src/pudl_archiver/frictionless.py index c5912b6e..31554bef 100644 --- a/src/pudl_archiver/frictionless.py +++ b/src/pudl_archiver/frictionless.py @@ -190,11 +190,7 @@ def from_non_pudl_metadata( version: str | None, ): """Create a datapackage for sources that won't end up in PUDL.""" - # TODO: This is a slightly ugly workaround to avoid having to add sources into - # the from_id method - should I just fix this at the source? 
- data_source = DataSource( - **DataSource.dict_from_id(x=name, sources=NON_PUDL_SOURCES) - ) + data_source = DataSource.from_id(name, sources=NON_PUDL_SOURCES) return DataPackage( name=data_source.name, From ef9180e34b0cd57b6fa2b379941ef6f002846947 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Tue, 14 Jan 2025 18:05:32 -0500 Subject: [PATCH 4/4] Make from_pudl_metadata more obvious by specifying sources --- src/pudl_archiver/frictionless.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pudl_archiver/frictionless.py b/src/pudl_archiver/frictionless.py index 31554bef..6ea62b13 100644 --- a/src/pudl_archiver/frictionless.py +++ b/src/pudl_archiver/frictionless.py @@ -167,7 +167,7 @@ def from_pudl_metadata( version: str | None, ) -> "DataPackage": """Create a datapackage using PUDL metadata associated with ``name``.""" - data_source = DataSource.from_id(name) + data_source = DataSource.from_id(name, sources=SOURCES) return DataPackage( name=f"pudl-raw-{data_source.name}",