diff --git a/README.md b/README.md index d648eb61..595e1382 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,29 @@ There are also four optional flags available: ## Adding a new dataset -### Step 1: Implement archiver interface +### Step 1: Define the dataset's metadata +For each dataset we archive, we record information about the title, a description, who +contributed to archiving the dataset, the segments into which the data files are +partitioned, its license and keywords. This +information is used to communicate about the dataset's usage and provenance to any +future users. + +* Title: The title of your dataset should clearly contain the agency publishing the data and a non-abbreviated title (e.g., EIA Manufacturing Energy Consumption Survey, not EIA MECS). +* Path: The link to the dataset's "homepage", where information about the dataset and the path to download it can be found. +* Working partitions: A dictionary where the key is the name of the partition (e.g., month, year, form), and the values are the actual available partitions (e.g., 2002-2020). +* License: We only archive data with an open source license (e.g., US Government Works or a Creative Commons License), so make sure any data you're archiving is licensed for re-distribution. +* Keywords: Words that someone might use to search for this dataset. These are used to help people find our data on Zenodo. + +If your dataset will be integrated directly into +[PUDL](https://github.com/catalyst-cooperative/pudl), you'll need to add the metadata +for the dataset into the PUDL repository in the `SOURCES` dictionary in +`src/pudl/metadata/sources.py`. + +If you aren't sure, or you're archiving data that won't go into PUDL, you'll want to +add your metadata as an entry into the `NON_PUDL_SOURCES` dictionary in +`src/pudl_archiver/metadata/sources.py`.
+ +### Step 2: Implement archiver interface All of the archivers inherit from the `AbstractDatasetArchiver` base class (defined in `src/pudl_archiver/archiver/classes.py`. There is only a single method that each @@ -130,7 +152,7 @@ hyperlinks matching the pattern on the page pointed to by the URL. This is usefu there's a page containing links to a series of data resources that have somewhat structured names. -### Step 2: Run --initialize command +### Step 3: Run --initialize command You will need to run the initialize command to create a new zenodo deposition, and update the config file with the new DOI: @@ -145,7 +167,7 @@ require you to create your own [Zenodo validation credentials](https://zenodo.org/account/settings/applications/tokens/new/) if you are not a core Catalyst developer. -### Step 3: Manually review your archive before publication. +### Step 4: Manually review your archive before publication. If the archiver run is successful, it will produce a link to the draft archive. 
Though many of the validation steps are automated, it is worthwhile manually reviewing archives diff --git a/src/pudl_archiver/archivers/eia/mecs.py b/src/pudl_archiver/archivers/eia/eiamecs.py similarity index 98% rename from src/pudl_archiver/archivers/eia/mecs.py rename to src/pudl_archiver/archivers/eia/eiamecs.py index cb2b5d41..c3ef0fbf 100644 --- a/src/pudl_archiver/archivers/eia/mecs.py +++ b/src/pudl_archiver/archivers/eia/eiamecs.py @@ -16,7 +16,7 @@ class EiaMECSArchiver(AbstractDatasetArchiver): """EIA MECS archiver.""" - name = "mecs" + name = "eiamecs" async def get_resources(self) -> ArchiveAwaitable: """Download EIA-MECS resources.""" diff --git a/src/pudl_archiver/depositors/zenodo/entities.py b/src/pudl_archiver/depositors/zenodo/entities.py index e3bddbee..4b91a10d 100644 --- a/src/pudl_archiver/depositors/zenodo/entities.py +++ b/src/pudl_archiver/depositors/zenodo/entities.py @@ -9,9 +9,11 @@ from typing import Annotated, Literal from pudl.metadata.classes import Contributor, DataSource +from pudl.metadata.sources import SOURCES from pydantic import BaseModel, Field, StringConstraints, field_validator from pudl_archiver.depositors.depositor import DepositionState +from pudl_archiver.metadata.sources import NON_PUDL_SOURCES from pudl_archiver.utils import Url logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -112,7 +114,9 @@ def check_empty_string(cls, doi: str): # noqa: N805 @classmethod def from_data_source(cls, data_source_id: str) -> "DepositionMetadata": """Construct deposition metadata object from PUDL DataSource model.""" - data_source = DataSource.from_id(data_source_id) + # Identify whether metadata originates from PUDL or archiver repo + sources = SOURCES if data_source_id in SOURCES else NON_PUDL_SOURCES + data_source = DataSource.from_id(data_source_id, sources=sources) creators = [ DepositionCreator.from_contributor(contributor) for contributor in data_source.contributors diff --git a/src/pudl_archiver/frictionless.py 
b/src/pudl_archiver/frictionless.py index 34796d9f..6ea62b13 100644 --- a/src/pudl_archiver/frictionless.py +++ b/src/pudl_archiver/frictionless.py @@ -9,8 +9,10 @@ from pudl.metadata.classes import Contributor, DataSource, License from pudl.metadata.constants import CONTRIBUTORS +from pudl.metadata.sources import SOURCES from pydantic import BaseModel, Field, field_serializer +from pudl_archiver.metadata.sources import NON_PUDL_SOURCES from pudl_archiver.utils import Url MEDIA_TYPES: dict[str, str] = { @@ -149,9 +151,13 @@ def new_datapackage( containing the local path to the resource, and its working partitions. version: Version string for current deposition version. """ - if name == "mecs": - return cls.mecs(resources=resources, version=version) - return cls.from_pudl_metadata(name=name, resources=resources, version=version) + if name in SOURCES: # If data source in PUDL source metadata + return cls.from_pudl_metadata( + name=name, resources=resources, version=version + ) + return cls.from_non_pudl_metadata( + name=name, resources=resources, version=version + ) @classmethod def from_pudl_metadata( @@ -161,7 +167,7 @@ def from_pudl_metadata( version: str | None, ) -> "DataPackage": """Create a datapackage using PUDL metadata associated with ``name``.""" - data_source = DataSource.from_id(name) + data_source = DataSource.from_id(name, sources=SOURCES) return DataPackage( name=f"pudl-raw-{data_source.name}", @@ -177,30 +183,25 @@ def from_pudl_metadata( ) @classmethod - def mecs(cls, resources: Iterable[Resource], version: str | None): - """Hack method to create a Datapackage for EIA MECS data not in PUDL metadata.""" + def from_non_pudl_metadata( + cls, + name: str, + resources: Iterable[Resource], + version: str | None, + ): + """Create a datapackage for sources that won't end up in PUDL.""" + data_source = DataSource.from_id(name, sources=NON_PUDL_SOURCES) + return DataPackage( - name="MECS", - title="EIA MECS data", - sources=[ - { - "title": "EIA MECS data", - 
"path": "https://www.eia.gov/consumption/manufacturing/data/2018/", - } - ], - licenses=[ - License( - **{ - "name": "CC-BY-4.0", - "title": "Creative Commons Attribution 4.0", - "path": "https://creativecommons.org/licenses/by/4.0", - } - ) - ], + name=data_source.name, + title=data_source.title, + sources=[{"title": data_source.title, "path": str(data_source.path)}], + licenses=[data_source.license_raw], resources=sorted(resources, key=lambda x: x.name), # Sort by filename - contributors=[CONTRIBUTORS["catalyst-cooperative"]], - created=str(datetime.datetime.now(tz=datetime.UTC).isoformat()), - keywords=["MECS"], - description="According to the EIA, the Manufacturing Energy Consumption Survey (MECS) is a national sample survey that collects information on the stock of U.S. manufacturing establishment, their energy-related building characteristics, and their energy consumption and expenditures.", + # Make it easier to add contributors directly into source dictionary + contributors=data_source.contributors, + created=datetime.datetime.now(tz=datetime.UTC).isoformat(), + keywords=data_source.keywords, + description=data_source.description, version=version, ) diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py new file mode 100644 index 00000000..db3bc8c9 --- /dev/null +++ b/src/pudl_archiver/metadata/sources.py @@ -0,0 +1,47 @@ +"""Metadata and operational constants.""" + +from typing import Any + +from pudl.metadata.constants import CONTRIBUTORS, LICENSES + +# To add a new contributor, follow the following format to add an entry to the +# ADDL_CONTRIBUTORS dictionary below formatted like this: +# "name-shorthand": { +# "title": "Catalyst Cooperative", +# "email": "pudl@catalyst.coop", +# "path": "https://catalyst.coop", +# "role": "publisher", +# "zenodo_role": "distributor", +# "organization": "Catalyst Cooperative", +# "orcid": "0000-1234-5678-9101" +# } +# Note that the only required fields are title (your name) and path +# 
(e.g., a link to your Github account, your ORCID site or a personal webpage), but
+# filling other fields is strongly encouraged!
+ADDL_CONTRIBUTORS: dict[str, dict[str, str]] = {}
+
+NON_PUDL_SOURCES: dict[str, Any] = {
+    "eiamecs": {
+        "title": "EIA Manufacturing Energy Consumption Survey",
+        "path": "https://www.eia.gov/consumption/manufacturing/data/2018/",
+        "description": (  # Adjacent literals are concatenated as-is, so each needs its trailing space.
+            "EIA Form 846 A and B is more commonly known as the Manufacturing Energy "
+            "Consumption Survey (MECS). MECS is a national sample survey that collects "
+            "information on the stock of U.S. manufacturing establishment, their "
+            "energy-related building characteristics, and their energy consumption "
+            "and expenditures. MECS is conducted every four years."
+        ),
+        "working_partitions": {
+            "years": [1991, 1994, 1998, 2002, 2006, 2010, 2014, 2018]
+        },  # One partition per survey year listed above.
+        "keywords": sorted(
+            {
+                "manufacturing",
+                "MECS",
+            }
+        ),
+        "license_raw": LICENSES["us-govt"],
+        "license_pudl": LICENSES["cc-by-4.0"],
+        "contributors": [CONTRIBUTORS["catalyst-cooperative"]],
+    },
+}